// Copyright 2007 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Compiled regular expression representation.
// Tested by compile_test.cc

#include "re2/prog.h"

#if defined(__AVX2__)
#include <immintrin.h>
#ifdef _MSC_VER
#include <intrin.h>
#endif
#endif
#include <stdint.h>
#include <string.h>
#include <algorithm>
#include <memory>
#include <utility>

#include "absl/base/macros.h"
#include "absl/strings/str_format.h"
#include "util/logging.h"
#include "re2/bitmap256.h"

namespace re2 {

// Constructors per Inst opcode

void Prog::Inst::InitAlt(uint32_t out, uint32_t out1) {
  DCHECK_EQ(out_opcode_, 0);
  set_out_opcode(out, kInstAlt);
  out1_ = out1;
}

void Prog::Inst::InitByteRange(int lo, int hi, int foldcase, uint32_t out) {
  DCHECK_EQ(out_opcode_, 0);
  set_out_opcode(out, kInstByteRange);
  lo_ = lo & 0xFF;
  hi_ = hi & 0xFF;
  hint_foldcase_ = foldcase&1;
}

void Prog::Inst::InitCapture(int cap, uint32_t out) {
  DCHECK_EQ(out_opcode_, 0);
  set_out_opcode(out, kInstCapture);
  cap_ = cap;
}

void Prog::Inst::InitEmptyWidth(EmptyOp empty, uint32_t out) {
  DCHECK_EQ(out_opcode_, 0);
  set_out_opcode(out, kInstEmptyWidth);
  empty_ = empty;
}

void Prog::Inst::InitMatch(int32_t id) {
  DCHECK_EQ(out_opcode_, 0);
  set_opcode(kInstMatch);
  match_id_ = id;
}

void Prog::Inst::InitNop(uint32_t out) {
  DCHECK_EQ(out_opcode_, 0);
  set_opcode(kInstNop);
}

void Prog::Inst::InitFail() {
  DCHECK_EQ(out_opcode_, 0);
  set_opcode(kInstFail);
}

std::string Prog::Inst::Dump() {
  switch (opcode()) {
    default:
      return absl::StrFormat("opcode %d", static_cast<int>(opcode()));

    case kInstAlt:
      return absl::StrFormat("alt -> %d | %d", out(), out1_);

    case kInstAltMatch:
      return absl::StrFormat("altmatch -> %d | %d", out(), out1_);

    case kInstByteRange:
      return absl::StrFormat("byte%s [%02x-%02x] %d -> %d",
                             foldcase() ? "/i" : "",
                             lo_, hi_, hint(), out());

    case kInstCapture:
      return absl::StrFormat("capture %d -> %d", cap_, out());

    case kInstEmptyWidth:
      return absl::StrFormat("emptywidth %#x -> %d",
                             static_cast<int>(empty_), out());

    case kInstMatch:
      return absl::StrFormat("match! %d", match_id());

    case kInstNop:
      return absl::StrFormat("nop -> %d", out());

    case kInstFail:
      return absl::StrFormat("fail");
  }
}

Prog::Prog()
  : anchor_start_(false),
    anchor_end_(false),
    reversed_(false),
    did_flatten_(false),
    did_onepass_(false),
    start_(0),
    start_unanchored_(0),
    size_(0),
    bytemap_range_(0),
    prefix_foldcase_(false),
    prefix_size_(0),
    list_count_(0),
    bit_state_text_max_size_(0),
    dfa_mem_(0),
    dfa_first_(NULL),
    dfa_longest_(NULL) {
}

Prog::~Prog() {
  DeleteDFA(dfa_longest_);
  DeleteDFA(dfa_first_);
  if (prefix_foldcase_)
    delete[] prefix_dfa_;
}

typedef SparseSet Workq;

static inline void AddToQueue(Workq* q, int id) {
  if (id != 0)
    q->insert(id);
}

static std::string ProgToString(Prog* prog, Workq* q) {
  std::string s;
  for (Workq::iterator i = q->begin(); i != q->end(); ++i) {
    int id = *i;
    Prog::Inst* ip = prog->inst(id);
    s += absl::StrFormat("%d. %s\n", id, ip->Dump());
    AddToQueue(q, ip->out());
    if (ip->opcode() == kInstAlt || ip->opcode() == kInstAltMatch)
      AddToQueue(q, ip->out1());
  }
  return s;
}

static std::string FlattenedProgToString(Prog* prog, int start) {
  std::string s;
  for (int id = start; id < prog->size(); id++) {
    Prog::Inst* ip = prog->inst(id);
    if (ip->last())
      s += absl::StrFormat("%d. %s\n", id, ip->Dump());
    else
      s += absl::StrFormat("%d+ %s\n", id, ip->Dump());
  }
  return s;
}

std::string Prog::Dump() {
  if (did_flatten_)
    return FlattenedProgToString(this, start_);

  Workq q(size_);
  AddToQueue(&q, start_);
  return ProgToString(this, &q);
}

std::string Prog::DumpUnanchored() {
  if (did_flatten_)
    return FlattenedProgToString(this, start_unanchored_);

  Workq q(size_);
  AddToQueue(&q, start_unanchored_);
  return ProgToString(this, &q);
}

std::string Prog::DumpByteMap() {
  std::string map;
  for (int c = 0; c < 256; c++) {
    int b = bytemap_[c];
    int lo = c;
    while (c < 256-1 && bytemap_[c+1] == b)
      c++;
    int hi = c;
    map += absl::StrFormat("[%02x-%02x] -> %d\n", lo, hi, b);
  }
  return map;
}
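
// For example, DumpByteMap() output for a bytemap that distinguishes only
// the lowercase ASCII letters would look like this (illustrative, not taken
// from a specific pattern):
//
//   [00-60] -> 0
//   [61-7a] -> 1
//   [7b-ff] -> 0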

// Is ip a guaranteed match at end of text, perhaps after some capturing?
static bool IsMatch(Prog* prog, Prog::Inst* ip) {
  for (;;) {
    switch (ip->opcode()) {
      default:
        LOG(DFATAL) << "Unexpected opcode in IsMatch: " << ip->opcode();
        return false;

      case kInstAlt:
      case kInstAltMatch:
      case kInstByteRange:
      case kInstFail:
      case kInstEmptyWidth:
        return false;

      case kInstCapture:
      case kInstNop:
        ip = prog->inst(ip->out());
        break;

      case kInstMatch:
        return true;
    }
  }
}

// Peep-hole optimizer.
void Prog::Optimize() {
  Workq q(size_);

  // Eliminate nops. Most are taken out during compilation
  // but a few are hard to avoid.
  q.clear();
  AddToQueue(&q, start_);
  for (Workq::iterator i = q.begin(); i != q.end(); ++i) {
    int id = *i;

    Inst* ip = inst(id);
    int j = ip->out();
    Inst* jp;
    while (j != 0 && (jp=inst(j))->opcode() == kInstNop) {
      j = jp->out();
    }
    ip->set_out(j);
    AddToQueue(&q, ip->out());

    if (ip->opcode() == kInstAlt) {
      j = ip->out1();
      while (j != 0 && (jp=inst(j))->opcode() == kInstNop) {
        j = jp->out();
      }
      ip->out1_ = j;
      AddToQueue(&q, ip->out1());
    }
  }

  // Insert kInstAltMatch instructions
  // Look for
  //   ip: Alt -> j | k
  //   j:  ByteRange [00-FF] -> ip
  //   k:  Match
  // or the reverse (the above is the greedy one).
  // Rewrite Alt to AltMatch.
  q.clear();
  AddToQueue(&q, start_);
  for (Workq::iterator i = q.begin(); i != q.end(); ++i) {
    int id = *i;
    Inst* ip = inst(id);
    AddToQueue(&q, ip->out());
    if (ip->opcode() == kInstAlt)
      AddToQueue(&q, ip->out1());

    if (ip->opcode() == kInstAlt) {
      Inst* j = inst(ip->out());
      Inst* k = inst(ip->out1());
      if (j->opcode() == kInstByteRange && j->out() == id &&
          j->lo() == 0x00 && j->hi() == 0xFF &&
          IsMatch(this, k)) {
        ip->set_opcode(kInstAltMatch);
        continue;
      }
      if (IsMatch(this, j) &&
          k->opcode() == kInstByteRange && k->out() == id &&
          k->lo() == 0x00 && k->hi() == 0xFF) {
        ip->set_opcode(kInstAltMatch);
      }
    }
  }
}

uint32_t Prog::EmptyFlags(absl::string_view text, const char* p) {
  int flags = 0;

  // ^ and \A
  if (p == text.data())
    flags |= kEmptyBeginText | kEmptyBeginLine;
  else if (p[-1] == '\n')
    flags |= kEmptyBeginLine;

  // $ and \z
  if (p == text.data() + text.size())
    flags |= kEmptyEndText | kEmptyEndLine;
  else if (p < text.data() + text.size() && p[0] == '\n')
    flags |= kEmptyEndLine;

  // \b and \B
  if (p == text.data() && p == text.data() + text.size()) {
    // no word boundary here
  } else if (p == text.data()) {
    if (IsWordChar(p[0]))
      flags |= kEmptyWordBoundary;
  } else if (p == text.data() + text.size()) {
    if (IsWordChar(p[-1]))
      flags |= kEmptyWordBoundary;
  } else {
    if (IsWordChar(p[-1]) != IsWordChar(p[0]))
      flags |= kEmptyWordBoundary;
  }
  if (!(flags & kEmptyWordBoundary))
    flags |= kEmptyNonWordBoundary;

  return flags;
}
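
// Worked example (illustrative): for text "ab cd" with p pointing at the
// space, p[-1] == 'b' is a word character and p[0] == ' ' is not, so the
// flags are just kEmptyWordBoundary; with p at the very start of the text,
// they would be kEmptyBeginText|kEmptyBeginLine|kEmptyWordBoundary instead.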

// ByteMapBuilder implements a coloring algorithm.
//
// The first phase is a series of "mark and merge" batches: we mark one or more
// [lo-hi] ranges, then merge them into our internal state. Batching is not for
// performance; rather, it means that the ranges are treated indistinguishably.
//
// Internally, the ranges are represented using a bitmap that stores the splits
// and a vector that stores the colors; both of them are indexed by the ranges'
// last bytes. Thus, in order to merge a [lo-hi] range, we split at lo-1 and at
// hi (if not already split), then recolor each range in between. The color map
// (i.e. from the old color to the new color) is maintained for the lifetime of
// the batch and so underpins this somewhat obscure approach to set operations.
//
// The second phase builds the bytemap from our internal state: we recolor each
// range, then store the new color (which is now the byte class) in each of the
// corresponding array elements. Finally, we output the number of byte classes.
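//
// For example (an illustrative trace, not taken from a specific pattern):
// starting from the initial [00-ff] range, Mark('a', 'z'); Merge();
// Mark('0', '9'); Merge(); Build(...) splits the byte space into [00-2f],
// [30-39], [3a-60], [61-7a] and [7b-ff]. Build() assigns byte class 0 to
// [00-2f], [3a-60] and [7b-ff] (they were never split apart by recoloring),
// class 1 to [30-39] and class 2 to [61-7a], and outputs a bytemap_range
// of 3.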
class ByteMapBuilder {
 public:
  ByteMapBuilder() {
    // Initial state: the [0-255] range has color 256.
    // This will avoid problems during the second phase,
    // in which we assign byte classes numbered from 0.
    splits_.Set(255);
    colors_[255] = 256;
    nextcolor_ = 257;
  }

  void Mark(int lo, int hi);
  void Merge();
  void Build(uint8_t* bytemap, int* bytemap_range);

 private:
  int Recolor(int oldcolor);

  Bitmap256 splits_;
  int colors_[256];
  int nextcolor_;
  std::vector<std::pair<int, int>> colormap_;
  std::vector<std::pair<int, int>> ranges_;

  ByteMapBuilder(const ByteMapBuilder&) = delete;
  ByteMapBuilder& operator=(const ByteMapBuilder&) = delete;
};

void ByteMapBuilder::Mark(int lo, int hi) {
  DCHECK_GE(lo, 0);
  DCHECK_GE(hi, 0);
  DCHECK_LE(lo, 255);
  DCHECK_LE(hi, 255);
  DCHECK_LE(lo, hi);

  // Ignore any [0-255] ranges. They cause us to recolor every range, which
  // has no effect on the eventual result and is therefore a waste of time.
  if (lo == 0 && hi == 255)
    return;

  ranges_.emplace_back(lo, hi);
}

void ByteMapBuilder::Merge() {
  for (std::vector<std::pair<int, int>>::const_iterator it = ranges_.begin();
       it != ranges_.end();
       ++it) {
    int lo = it->first-1;
    int hi = it->second;

    if (0 <= lo && !splits_.Test(lo)) {
      splits_.Set(lo);
      int next = splits_.FindNextSetBit(lo+1);
      colors_[lo] = colors_[next];
    }
    if (!splits_.Test(hi)) {
      splits_.Set(hi);
      int next = splits_.FindNextSetBit(hi+1);
      colors_[hi] = colors_[next];
    }

    int c = lo+1;
    while (c < 256) {
      int next = splits_.FindNextSetBit(c);
      colors_[next] = Recolor(colors_[next]);
      if (next == hi)
        break;
      c = next+1;
    }
  }
  colormap_.clear();
  ranges_.clear();
}

void ByteMapBuilder::Build(uint8_t* bytemap, int* bytemap_range) {
  // Assign byte classes numbered from 0.
  nextcolor_ = 0;

  int c = 0;
  while (c < 256) {
    int next = splits_.FindNextSetBit(c);
    uint8_t b = static_cast<uint8_t>(Recolor(colors_[next]));
    while (c <= next) {
      bytemap[c] = b;
      c++;
    }
  }

  *bytemap_range = nextcolor_;
}

int ByteMapBuilder::Recolor(int oldcolor) {
  // Yes, this is a linear search. There can be at most 256
  // colors and there will typically be far fewer than that.
  // Also, we need to consider keys *and* values in order to
  // avoid recoloring a given range more than once per batch.
  std::vector<std::pair<int, int>>::const_iterator it =
      std::find_if(colormap_.begin(), colormap_.end(),
                   [=](const std::pair<int, int>& kv) -> bool {
                     return kv.first == oldcolor || kv.second == oldcolor;
                   });
  if (it != colormap_.end())
    return it->second;
  int newcolor = nextcolor_;
  nextcolor_++;
  colormap_.emplace_back(oldcolor, newcolor);
  return newcolor;
}

void Prog::ComputeByteMap() {
  // Fill in bytemap with byte classes for the program.
  // Ranges of bytes that are treated indistinguishably
  // will be mapped to a single byte class.
  ByteMapBuilder builder;

  // Don't repeat the work for ^ and $.
  bool marked_line_boundaries = false;
  // Don't repeat the work for \b and \B.
  bool marked_word_boundaries = false;

  for (int id = 0; id < size(); id++) {
    Inst* ip = inst(id);
    if (ip->opcode() == kInstByteRange) {
      int lo = ip->lo();
      int hi = ip->hi();
      builder.Mark(lo, hi);
      if (ip->foldcase() && lo <= 'z' && hi >= 'a') {
        int foldlo = lo;
        int foldhi = hi;
        if (foldlo < 'a')
          foldlo = 'a';
        if (foldhi > 'z')
          foldhi = 'z';
        if (foldlo <= foldhi) {
          foldlo += 'A' - 'a';
          foldhi += 'A' - 'a';
          builder.Mark(foldlo, foldhi);
        }
      }
      // If this Inst is not the last Inst in its list AND the next Inst is
      // also a ByteRange AND the Insts have the same out, defer the merge.
      if (!ip->last() &&
          inst(id+1)->opcode() == kInstByteRange &&
          ip->out() == inst(id+1)->out())
        continue;
      builder.Merge();
    } else if (ip->opcode() == kInstEmptyWidth) {
      if (ip->empty() & (kEmptyBeginLine|kEmptyEndLine) &&
          !marked_line_boundaries) {
        builder.Mark('\n', '\n');
        builder.Merge();
        marked_line_boundaries = true;
      }
      if (ip->empty() & (kEmptyWordBoundary|kEmptyNonWordBoundary) &&
          !marked_word_boundaries) {
        // We require two batches here: the first for ranges that are word
        // characters, the second for ranges that are not word characters.
        for (bool isword : {true, false}) {
          int j;
          for (int i = 0; i < 256; i = j) {
            for (j = i + 1; j < 256 &&
                            Prog::IsWordChar(static_cast<uint8_t>(i)) ==
                                Prog::IsWordChar(static_cast<uint8_t>(j));
                 j++)
              ;
            if (Prog::IsWordChar(static_cast<uint8_t>(i)) == isword)
              builder.Mark(i, j - 1);
          }
          builder.Merge();
        }
        marked_word_boundaries = true;
      }
    }
  }

  builder.Build(bytemap_, &bytemap_range_);

  if (0) {  // For debugging, use trivial bytemap.
    LOG(ERROR) << "Using trivial bytemap.";
    for (int i = 0; i < 256; i++)
      bytemap_[i] = static_cast<uint8_t>(i);
    bytemap_range_ = 256;
  }
}

// Prog::Flatten() implements a graph rewriting algorithm.
//
// The overall process is similar to epsilon removal, but retains some epsilon
// transitions: those from Capture and EmptyWidth instructions; and those from
// nullable subexpressions. (The latter avoids quadratic blowup in transitions
// in the worst case.) It might be best thought of as Alt instruction elision.
//
// In conceptual terms, it divides the Prog into "trees" of instructions, then
// traverses the "trees" in order to produce "lists" of instructions. A "tree"
// is one or more instructions that grow from one "root" instruction to one or
// more "leaf" instructions; if a "tree" has exactly one instruction, then the
// "root" is also the "leaf". In most cases, a "root" is the successor of some
// "leaf" (i.e. the "leaf" instruction's out() returns the "root" instruction)
// and is considered a "successor root". A "leaf" can be a ByteRange, Capture,
// EmptyWidth or Match instruction. However, this is insufficient for handling
// nested nullable subexpressions correctly, so in some cases, a "root" is the
// dominator of the instructions reachable from some "successor root" (i.e. it
// has an unreachable predecessor) and is considered a "dominator root". Since
// only Alt instructions can be "dominator roots" (other instructions would be
// "leaves"), only Alt instructions are required to be marked as predecessors.
//
// Dividing the Prog into "trees" comprises two passes: marking the "successor
// roots" and the predecessors; and marking the "dominator roots". Sorting the
// "successor roots" by their bytecode offsets enables iteration in order from
// greatest to least during the second pass; by working backwards in this case
// and flooding the graph no further than "leaves" and already marked "roots",
// it becomes possible to mark "dominator roots" without doing excessive work.
//
// Traversing the "trees" is just iterating over the "roots" in order of their
// marking and flooding the graph no further than "leaves" and "roots". When a
// "leaf" is reached, the instruction is copied with its successor remapped to
// its "root" number. When a "root" is reached, a Nop instruction is generated
// with its successor remapped similarly. As each "list" is produced, its last
// instruction is marked as such. After all of the "lists" have been produced,
// a pass over their instructions remaps their successors to bytecode offsets.
void Prog::Flatten() {
  if (did_flatten_)
    return;
  did_flatten_ = true;

  // Scratch structures. It's important that these are reused by functions
  // that we call in loops because they would thrash the heap otherwise.
  SparseSet reachable(size());
  std::vector<int> stk;
  stk.reserve(size());

  // First pass: Marks "successor roots" and predecessors.
  // Builds the mapping from inst-ids to root-ids.
  SparseArray<int> rootmap(size());
  SparseArray<int> predmap(size());
  std::vector<std::vector<int>> predvec;
  MarkSuccessors(&rootmap, &predmap, &predvec, &reachable, &stk);

  // Second pass: Marks "dominator roots".
  SparseArray<int> sorted(rootmap);
  std::sort(sorted.begin(), sorted.end(), sorted.less);
  for (SparseArray<int>::const_iterator i = sorted.end() - 1;
       i != sorted.begin();
       --i) {
    if (i->index() != start_unanchored() && i->index() != start())
      MarkDominator(i->index(), &rootmap, &predmap, &predvec, &reachable, &stk);
  }

  // Third pass: Emits "lists". Remaps outs to root-ids.
  // Builds the mapping from root-ids to flat-ids.
  std::vector<int> flatmap(rootmap.size());
  std::vector<Inst> flat;
  flat.reserve(size());
  for (SparseArray<int>::const_iterator i = rootmap.begin();
       i != rootmap.end();
       ++i) {
    flatmap[i->value()] = static_cast<int>(flat.size());
    EmitList(i->index(), &rootmap, &flat, &reachable, &stk);
    flat.back().set_last();
    // We have the bounds of the "list", so this is the
    // most convenient point at which to compute hints.
    ComputeHints(&flat, flatmap[i->value()], static_cast<int>(flat.size()));
  }

  list_count_ = static_cast<int>(flatmap.size());
  for (int i = 0; i < kNumInst; i++)
    inst_count_[i] = 0;

  // Fourth pass: Remaps outs to flat-ids.
  // Counts instructions by opcode.
  for (int id = 0; id < static_cast<int>(flat.size()); id++) {
    Inst* ip = &flat[id];
    if (ip->opcode() != kInstAltMatch)  // handled in EmitList()
      ip->set_out(flatmap[ip->out()]);
    inst_count_[ip->opcode()]++;
  }

#if !defined(NDEBUG)
  // Address a `-Wunused-but-set-variable' warning from Clang 13.x.
  size_t total = 0;
  for (int i = 0; i < kNumInst; i++)
    total += inst_count_[i];
  CHECK_EQ(total, flat.size());
#endif

  // Remap start_unanchored and start.
  if (start_unanchored() == 0) {
    DCHECK_EQ(start(), 0);
  } else if (start_unanchored() == start()) {
    set_start_unanchored(flatmap[1]);
    set_start(flatmap[1]);
  } else {
    set_start_unanchored(flatmap[1]);
    set_start(flatmap[2]);
  }

  // Finally, replace the old instructions with the new instructions.
  size_ = static_cast<int>(flat.size());
  inst_ = PODArray<Inst>(size_);
  memmove(inst_.data(), flat.data(), size_*sizeof inst_[0]);

  // Populate the list heads for BitState.
  // 512 instructions limits the memory footprint to 1KiB.
  if (size_ <= 512) {
    list_heads_ = PODArray<uint16_t>(size_);
    // 0xFF makes it more obvious if we try to look up a non-head.
    memset(list_heads_.data(), 0xFF, size_*sizeof list_heads_[0]);
    for (int i = 0; i < list_count_; ++i)
      list_heads_[flatmap[i]] = i;
  }

  // BitState allocates a bitmap of size list_count_ * (text.size()+1)
  // for tracking pairs of possibilities that it has already explored.
  const size_t kBitStateBitmapMaxSize = 256*1024;  // max size in bits
  bit_state_text_max_size_ = kBitStateBitmapMaxSize / list_count_ - 1;
}

void Prog::MarkSuccessors(SparseArray<int>* rootmap,
                          SparseArray<int>* predmap,
                          std::vector<std::vector<int>>* predvec,
                          SparseSet* reachable, std::vector<int>* stk) {
  // Mark the kInstFail instruction.
  rootmap->set_new(0, rootmap->size());

  // Mark the start_unanchored and start instructions.
  if (!rootmap->has_index(start_unanchored()))
    rootmap->set_new(start_unanchored(), rootmap->size());
  if (!rootmap->has_index(start()))
    rootmap->set_new(start(), rootmap->size());

  reachable->clear();
  stk->clear();
  stk->push_back(start_unanchored());
  while (!stk->empty()) {
    int id = stk->back();
    stk->pop_back();
  Loop:
    if (reachable->contains(id))
      continue;
    reachable->insert_new(id);

    Inst* ip = inst(id);
    switch (ip->opcode()) {
      default:
        LOG(DFATAL) << "unhandled opcode: " << ip->opcode();
        break;

      case kInstAltMatch:
      case kInstAlt:
        // Mark this instruction as a predecessor of each out.
        for (int out : {ip->out(), ip->out1()}) {
          if (!predmap->has_index(out)) {
            predmap->set_new(out, static_cast<int>(predvec->size()));
            predvec->emplace_back();
          }
          (*predvec)[predmap->get_existing(out)].emplace_back(id);
        }
        stk->push_back(ip->out1());
        id = ip->out();
        goto Loop;

      case kInstByteRange:
      case kInstCapture:
      case kInstEmptyWidth:
        // Mark the out of this instruction as a "root".
        if (!rootmap->has_index(ip->out()))
          rootmap->set_new(ip->out(), rootmap->size());
        id = ip->out();
        goto Loop;

      case kInstNop:
        id = ip->out();
        goto Loop;

      case kInstMatch:
      case kInstFail:
        break;
    }
  }
}

void Prog::MarkDominator(int root, SparseArray<int>* rootmap,
                         SparseArray<int>* predmap,
                         std::vector<std::vector<int>>* predvec,
                         SparseSet* reachable, std::vector<int>* stk) {
  reachable->clear();
  stk->clear();
  stk->push_back(root);
  while (!stk->empty()) {
    int id = stk->back();
    stk->pop_back();
  Loop:
    if (reachable->contains(id))
      continue;
    reachable->insert_new(id);

    if (id != root && rootmap->has_index(id)) {
      // We reached another "tree" via epsilon transition.
      continue;
    }

    Inst* ip = inst(id);
    switch (ip->opcode()) {
      default:
        LOG(DFATAL) << "unhandled opcode: " << ip->opcode();
        break;

      case kInstAltMatch:
      case kInstAlt:
        stk->push_back(ip->out1());
        id = ip->out();
        goto Loop;

      case kInstByteRange:
      case kInstCapture:
      case kInstEmptyWidth:
        break;

      case kInstNop:
        id = ip->out();
        goto Loop;

      case kInstMatch:
      case kInstFail:
        break;
    }
  }

  for (SparseSet::const_iterator i = reachable->begin();
       i != reachable->end();
       ++i) {
    int id = *i;
    if (predmap->has_index(id)) {
      for (int pred : (*predvec)[predmap->get_existing(id)]) {
        if (!reachable->contains(pred)) {
          // id has a predecessor that cannot be reached from root!
          // Therefore, id must be a "root" too - mark it as such.
          if (!rootmap->has_index(id))
            rootmap->set_new(id, rootmap->size());
        }
      }
    }
  }
}

void Prog::EmitList(int root, SparseArray<int>* rootmap,
                    std::vector<Inst>* flat,
                    SparseSet* reachable, std::vector<int>* stk) {
  reachable->clear();
  stk->clear();
  stk->push_back(root);
  while (!stk->empty()) {
    int id = stk->back();
    stk->pop_back();
  Loop:
    if (reachable->contains(id))
      continue;
    reachable->insert_new(id);

    if (id != root && rootmap->has_index(id)) {
      // We reached another "tree" via epsilon transition. Emit a kInstNop
      // instruction so that the Prog does not become quadratically larger.
      flat->emplace_back();
      flat->back().set_opcode(kInstNop);
      flat->back().set_out(rootmap->get_existing(id));
      continue;
    }

    Inst* ip = inst(id);
    switch (ip->opcode()) {
      default:
        LOG(DFATAL) << "unhandled opcode: " << ip->opcode();
        break;

      case kInstAltMatch:
        flat->emplace_back();
        flat->back().set_opcode(kInstAltMatch);
        flat->back().set_out(static_cast<int>(flat->size()));
        flat->back().out1_ = static_cast<uint32_t>(flat->size())+1;
        ABSL_FALLTHROUGH_INTENDED;

      case kInstAlt:
        stk->push_back(ip->out1());
        id = ip->out();
        goto Loop;

      case kInstByteRange:
      case kInstCapture:
      case kInstEmptyWidth:
        flat->emplace_back();
        memmove(&flat->back(), ip, sizeof *ip);
        flat->back().set_out(rootmap->get_existing(ip->out()));
        break;

      case kInstNop:
        id = ip->out();
        goto Loop;

      case kInstMatch:
      case kInstFail:
        flat->emplace_back();
        memmove(&flat->back(), ip, sizeof *ip);
        break;
    }
  }
}

// For each ByteRange instruction in [begin, end), computes a hint to execution
// engines: the delta to the next instruction (in flat) worth exploring iff the
// current instruction matched.
//
// Implements a coloring algorithm related to ByteMapBuilder, but in this case,
// colors are instructions and recoloring ranges precisely identifies conflicts
// between instructions. Iterating backwards over [begin, end) is guaranteed to
// identify the nearest conflict (if any) with only linear complexity.
void Prog::ComputeHints(std::vector<Inst>* flat, int begin, int end) {
  Bitmap256 splits;
  int colors[256];

  bool dirty = false;
  for (int id = end; id >= begin; --id) {
    if (id == end ||
        (*flat)[id].opcode() != kInstByteRange) {
      if (dirty) {
        dirty = false;
        splits.Clear();
      }
      splits.Set(255);
      colors[255] = id;
      // At this point, the [0-255] range is colored with id.
      // Thus, hints cannot point beyond id; and if id == end,
      // hints that would have pointed to id will be 0 instead.
      continue;
    }
    dirty = true;

    // We recolor the [lo-hi] range with id. Note that first ratchets backwards
    // from end to the nearest conflict (if any) during recoloring.
    int first = end;
    auto Recolor = [&](int lo, int hi) {
      // Like ByteMapBuilder, we split at lo-1 and at hi.
      --lo;

      if (0 <= lo && !splits.Test(lo)) {
        splits.Set(lo);
        int next = splits.FindNextSetBit(lo+1);
        colors[lo] = colors[next];
      }
      if (!splits.Test(hi)) {
        splits.Set(hi);
        int next = splits.FindNextSetBit(hi+1);
        colors[hi] = colors[next];
      }

      int c = lo+1;
      while (c < 256) {
        int next = splits.FindNextSetBit(c);
        // Ratchet backwards...
        first = std::min(first, colors[next]);
        // Recolor with id - because it's the new nearest conflict!
        colors[next] = id;
        if (next == hi)
          break;
        c = next+1;
      }
    };

    Inst* ip = &(*flat)[id];
    int lo = ip->lo();
    int hi = ip->hi();
    Recolor(lo, hi);
    if (ip->foldcase() && lo <= 'z' && hi >= 'a') {
      int foldlo = lo;
      int foldhi = hi;
      if (foldlo < 'a')
        foldlo = 'a';
      if (foldhi > 'z')
        foldhi = 'z';
      if (foldlo <= foldhi) {
        foldlo += 'A' - 'a';
        foldhi += 'A' - 'a';
        Recolor(foldlo, foldhi);
      }
    }

    if (first != end) {
      uint16_t hint = static_cast<uint16_t>(std::min(first - id, 32767));
      ip->hint_foldcase_ |= hint<<1;
    }
  }
}
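
// Note that the hint shares hint_foldcase_ with the foldcase flag: bit 0 is
// foldcase (hence the mask in InitByteRange() above) and the remaining bits
// hold the hint (hence the shift by 1 and the cap of 32767). A hint of 0
// means that no later instruction in the list conflicts with this one.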

// The final state will always be this, which frees up a register for the hot
// loop and thus avoids the spilling that can occur when building with Clang.
static const size_t kShiftDFAFinal = 9;

// This function takes the prefix as std::string (i.e. not const std::string&
// as normal) because it's going to clobber it, so a temporary is convenient.
static uint64_t* BuildShiftDFA(std::string prefix) {
  // This constant is for convenience now and also for correctness later when
  // we clobber the prefix, but still need to know how long it was initially.
  const size_t size = prefix.size();

  // Construct the NFA.
  // The table is indexed by input byte; each element is a bitfield of states
  // reachable by the input byte. Given a bitfield of the current states, the
  // bitfield of states reachable from those is - for this specific purpose -
  // always ((ncurr << 1) | 1). Intersecting the reachability bitfields gives
  // the bitfield of the next states reached by stepping over the input byte.
  // Credits for this technique: the Hyperscan paper by Geoff Langdale et al.
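  //
  // Worked example (illustrative): for the prefix "ab", the loops below set
  // nfa['a'] = 0b011 and nfa['b'] = 0b101 (bit 0 is set for every byte so
  // that state 0 can always restart). From the initial bitfield 1 (state 0
  // only), stepping over 'a' computes nfa['a'] & ((1 << 1) | 1) == 0b011,
  // i.e. states {0, 1}; stepping over 'b' then computes
  // nfa['b'] & ((0b011 << 1) | 1) == 0b101, i.e. states {0, 2}, and state 2
  // means that the whole prefix has been seen.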
  uint16_t nfa[256]{};
  for (size_t i = 0; i < size; ++i) {
    uint8_t b = prefix[i];
    nfa[b] |= 1 << (i+1);
  }
  // This is the `\C*?` for unanchored search.
  for (int b = 0; b < 256; ++b)
    nfa[b] |= 1;

  // This maps from DFA state to NFA states; the reverse mapping is used when
  // recording transitions and gets implemented with plain old linear search.
  // The "Shift DFA" technique limits this to ten states when using uint64_t;
  // to allow for the initial state, we use at most nine bytes of the prefix.
  // That same limit is also why uint16_t is sufficient for the NFA bitfield.
  uint16_t states[kShiftDFAFinal+1]{};
  states[0] = 1;
  for (size_t dcurr = 0; dcurr < size; ++dcurr) {
    uint8_t b = prefix[dcurr];
    uint16_t ncurr = states[dcurr];
    uint16_t nnext = nfa[b] & ((ncurr << 1) | 1);
    size_t dnext = dcurr+1;
    if (dnext == size)
      dnext = kShiftDFAFinal;
    states[dnext] = nnext;
  }

  // Sort and unique the bytes of the prefix to avoid repeating work while we
  // record transitions. This clobbers the prefix, but it's no longer needed.
  std::sort(prefix.begin(), prefix.end());
  prefix.erase(std::unique(prefix.begin(), prefix.end()), prefix.end());

  // Construct the DFA.
  // The table is indexed by input byte; each element is effectively a packed
  // array of uint6_t; each array value will be multiplied by six in order to
  // avoid having to do so later in the hot loop as well as masking/shifting.
  // Credits for this technique: "Shift-based DFAs" on GitHub by Per Vognsen.
  uint64_t* dfa = new uint64_t[256]{};
  // Record a transition from each state for each of the bytes of the prefix.
  // Note that all other input bytes go back to the initial state by default.
  for (size_t dcurr = 0; dcurr < size; ++dcurr) {
    for (uint8_t b : prefix) {
      uint16_t ncurr = states[dcurr];
      uint16_t nnext = nfa[b] & ((ncurr << 1) | 1);
      size_t dnext = 0;
      while (states[dnext] != nnext)
        ++dnext;
      dfa[b] |= static_cast<uint64_t>(dnext * 6) << (dcurr * 6);
      // Convert ASCII letters to uppercase and record the extra transitions.
      // Note that ASCII letters are guaranteed to be lowercase at this point
      // because that's how the parser normalises them. #FunFact: 'k' and 's'
      // match U+212A and U+017F, respectively, so they won't occur here when
      // using UTF-8 encoding because the parser will emit character classes.
      if ('a' <= b && b <= 'z') {
        b -= 'a' - 'A';
        dfa[b] |= static_cast<uint64_t>(dnext * 6) << (dcurr * 6);
      }
    }
  }
  // This lets the final state "saturate", which will matter for performance:
  // in the hot loop, we check for a match only at the end of each iteration,
  // so we must keep signalling the match until we get around to checking it.
  for (int b = 0; b < 256; ++b)
    dfa[b] |= static_cast<uint64_t>(kShiftDFAFinal * 6) << (kShiftDFAFinal * 6);

  return dfa;
}
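
// A sketch of how the table is consumed (see PrefixAccel_ShiftDFA() below):
// each uint64_t element packs ten 6-bit values, so stepping the DFA over an
// input byte b is just `curr = dfa[b] >> (curr & 63)`, where (curr & 63) is
// the current state premultiplied by six, and the prefix has been matched
// once (curr & 63) == kShiftDFAFinal * 6.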

void Prog::ConfigurePrefixAccel(const std::string& prefix,
                                bool prefix_foldcase) {
  prefix_foldcase_ = prefix_foldcase;
  prefix_size_ = prefix.size();
  if (prefix_foldcase_) {
    // Use PrefixAccel_ShiftDFA().
    // ... and no more than nine bytes of the prefix. (See above for details.)
    prefix_size_ = std::min(prefix_size_, kShiftDFAFinal);
    prefix_dfa_ = BuildShiftDFA(prefix.substr(0, prefix_size_));
  } else if (prefix_size_ != 1) {
    // Use PrefixAccel_FrontAndBack().
    prefix_front_ = prefix.front();
    prefix_back_ = prefix.back();
  } else {
    // Use memchr(3).
    prefix_front_ = prefix.front();
  }
}

const void* Prog::PrefixAccel_ShiftDFA(const void* data, size_t size) {
  if (size < prefix_size_)
    return NULL;

  uint64_t curr = 0;

  // At the time of writing, rough benchmarks on a Broadwell machine showed
  // that this unroll factor (i.e. eight) achieves a speedup factor of two.
  if (size >= 8) {
    const uint8_t* p = reinterpret_cast<const uint8_t*>(data);
    const uint8_t* endp = p + (size&~7);
    do {
      uint8_t b0 = p[0];
      uint8_t b1 = p[1];
      uint8_t b2 = p[2];
      uint8_t b3 = p[3];
      uint8_t b4 = p[4];
      uint8_t b5 = p[5];
      uint8_t b6 = p[6];
      uint8_t b7 = p[7];

      uint64_t next0 = prefix_dfa_[b0];
      uint64_t next1 = prefix_dfa_[b1];
      uint64_t next2 = prefix_dfa_[b2];
      uint64_t next3 = prefix_dfa_[b3];
      uint64_t next4 = prefix_dfa_[b4];
      uint64_t next5 = prefix_dfa_[b5];
      uint64_t next6 = prefix_dfa_[b6];
      uint64_t next7 = prefix_dfa_[b7];

      uint64_t curr0 = next0 >> (curr & 63);
      uint64_t curr1 = next1 >> (curr0 & 63);
      uint64_t curr2 = next2 >> (curr1 & 63);
      uint64_t curr3 = next3 >> (curr2 & 63);
      uint64_t curr4 = next4 >> (curr3 & 63);
      uint64_t curr5 = next5 >> (curr4 & 63);
      uint64_t curr6 = next6 >> (curr5 & 63);
      uint64_t curr7 = next7 >> (curr6 & 63);

      if ((curr7 & 63) == kShiftDFAFinal * 6) {
        // At the time of writing, using the same masking subexpressions from
        // the preceding lines caused Clang to clutter the hot loop computing
        // them - even though they aren't actually needed for shifting! Hence
        // these rewritten conditions, which achieve a speedup factor of two.
        if (((curr7-curr0) & 63) == 0) return p+1-prefix_size_;
        if (((curr7-curr1) & 63) == 0) return p+2-prefix_size_;
        if (((curr7-curr2) & 63) == 0) return p+3-prefix_size_;
        if (((curr7-curr3) & 63) == 0) return p+4-prefix_size_;
        if (((curr7-curr4) & 63) == 0) return p+5-prefix_size_;
        if (((curr7-curr5) & 63) == 0) return p+6-prefix_size_;
        if (((curr7-curr6) & 63) == 0) return p+7-prefix_size_;
        if (((curr7-curr7) & 63) == 0) return p+8-prefix_size_;
      }

      curr = curr7;
      p += 8;
    } while (p != endp);
    data = p;
    size = size&7;
  }

  const uint8_t* p = reinterpret_cast<const uint8_t*>(data);
  const uint8_t* endp = p + size;
  while (p != endp) {
    uint8_t b = *p++;
    uint64_t next = prefix_dfa_[b];
    curr = next >> (curr & 63);
    if ((curr & 63) == kShiftDFAFinal * 6)
      return p-prefix_size_;
  }
  return NULL;
}

#if defined(__AVX2__)
// Finds the least significant non-zero bit in n.
static int FindLSBSet(uint32_t n) {
  DCHECK_NE(n, 0);
#if defined(__GNUC__)
  return __builtin_ctz(n);
#elif defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86))
  unsigned long c;
  _BitScanForward(&c, n);
  return static_cast<int>(c);
#else
  int c = 31;
  for (int shift = 1 << 4; shift != 0; shift >>= 1) {
    uint32_t word = n << shift;
    if (word != 0) {
      n = word;
      c -= shift;
    }
  }
  return c;
#endif
}
#endif

const void* Prog::PrefixAccel_FrontAndBack(const void* data, size_t size) {
  DCHECK_GE(prefix_size_, 2);
  if (size < prefix_size_)
    return NULL;
  // Don't bother searching the last prefix_size_-1 bytes for prefix_front_.
  // This also means that probing for prefix_back_ doesn't go out of bounds.
  size -= prefix_size_-1;

#if defined(__AVX2__)
  // Use AVX2 to look for prefix_front_ and prefix_back_ 32 bytes at a time.
  if (size >= sizeof(__m256i)) {
    const __m256i* fp = reinterpret_cast<const __m256i*>(
        reinterpret_cast<const char*>(data));
    const __m256i* bp = reinterpret_cast<const __m256i*>(
        reinterpret_cast<const char*>(data) + prefix_size_-1);
    const __m256i* endfp = fp + size/sizeof(__m256i);
    const __m256i f_set1 = _mm256_set1_epi8(prefix_front_);
    const __m256i b_set1 = _mm256_set1_epi8(prefix_back_);
    do {
      const __m256i f_loadu = _mm256_loadu_si256(fp++);
      const __m256i b_loadu = _mm256_loadu_si256(bp++);
      const __m256i f_cmpeq = _mm256_cmpeq_epi8(f_set1, f_loadu);
      const __m256i b_cmpeq = _mm256_cmpeq_epi8(b_set1, b_loadu);
      const int fb_testz = _mm256_testz_si256(f_cmpeq, b_cmpeq);
      if (fb_testz == 0) {  // ZF: 1 means zero, 0 means non-zero.
        const __m256i fb_and = _mm256_and_si256(f_cmpeq, b_cmpeq);
        const int fb_movemask = _mm256_movemask_epi8(fb_and);
        const int fb_ctz = FindLSBSet(fb_movemask);
        return reinterpret_cast<const char*>(fp-1) + fb_ctz;
      }
    } while (fp != endfp);
    data = fp;
    size = size%sizeof(__m256i);
  }
#endif

  const char* p0 = reinterpret_cast<const char*>(data);
  for (const char* p = p0;; p++) {
    DCHECK_GE(size, static_cast<size_t>(p-p0));
    p = reinterpret_cast<const char*>(memchr(p, prefix_front_, size - (p-p0)));
    if (p == NULL || p[prefix_size_-1] == prefix_back_)
      return p;
  }
}

}  // namespace re2