1// Copyright 2007 The RE2 Authors. All Rights Reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5#ifndef RE2_PROG_H_
6#define RE2_PROG_H_
7
8// Compiled representation of regular expressions.
9// See regexp.h for the Regexp class, which represents a regular
10// expression symbolically.
11
12#include <stdint.h>
13#include <functional>
14#include <string>
15#include <vector>
16#include <type_traits>
17
18#include "absl/base/call_once.h"
19#include "absl/strings/string_view.h"
20#include "util/logging.h"
21#include "re2/pod_array.h"
22#include "re2/re2.h"
23#include "re2/sparse_array.h"
24#include "re2/sparse_set.h"
25
26namespace re2 {
27
28// Opcodes for Inst
29enum InstOp {
30 kInstAlt = 0, // choose between out_ and out1_
31 kInstAltMatch, // Alt: out_ is [00-FF] and back, out1_ is match; or vice versa.
32 kInstByteRange, // next (possible case-folded) byte must be in [lo_, hi_]
33 kInstCapture, // capturing parenthesis number cap_
34 kInstEmptyWidth, // empty-width special (^ $ ...); bit(s) set in empty_
35 kInstMatch, // found a match!
36 kInstNop, // no-op; occasionally unavoidable
37 kInstFail, // never match; occasionally unavoidable
38 kNumInst,
39};
40
41// Bit flags for empty-width specials
42enum EmptyOp {
43 kEmptyBeginLine = 1<<0, // ^ - beginning of line
44 kEmptyEndLine = 1<<1, // $ - end of line
45 kEmptyBeginText = 1<<2, // \A - beginning of text
46 kEmptyEndText = 1<<3, // \z - end of text
47 kEmptyWordBoundary = 1<<4, // \b - word boundary
48 kEmptyNonWordBoundary = 1<<5, // \B - not \b
49 kEmptyAllFlags = (1<<6)-1,
50};
51
52class DFA;
53class Regexp;
54
55// Compiled form of regexp program.
56class Prog {
57 public:
58 Prog();
59 ~Prog();
60
61 // Single instruction in regexp program.
62 class Inst {
63 public:
64 // See the assertion below for why this is so.
65 Inst() = default;
66
67 // Copyable.
68 Inst(const Inst&) = default;
69 Inst& operator=(const Inst&) = default;
70
71 // Constructors per opcode
72 void InitAlt(uint32_t out, uint32_t out1);
73 void InitByteRange(int lo, int hi, int foldcase, uint32_t out);
74 void InitCapture(int cap, uint32_t out);
75 void InitEmptyWidth(EmptyOp empty, uint32_t out);
76 void InitMatch(int id);
77 void InitNop(uint32_t out);
78 void InitFail();
79
80 // Getters
81 int id(Prog* p) { return static_cast<int>(this - p->inst_.data()); }
82 InstOp opcode() { return static_cast<InstOp>(out_opcode_&7); }
83 int last() { return (out_opcode_>>3)&1; }
84 int out() { return out_opcode_>>4; }
85 int out1() { DCHECK(opcode() == kInstAlt || opcode() == kInstAltMatch); return out1_; }
86 int cap() { DCHECK_EQ(opcode(), kInstCapture); return cap_; }
87 int lo() { DCHECK_EQ(opcode(), kInstByteRange); return lo_; }
88 int hi() { DCHECK_EQ(opcode(), kInstByteRange); return hi_; }
89 int foldcase() { DCHECK_EQ(opcode(), kInstByteRange); return hint_foldcase_&1; }
90 int hint() { DCHECK_EQ(opcode(), kInstByteRange); return hint_foldcase_>>1; }
91 int match_id() { DCHECK_EQ(opcode(), kInstMatch); return match_id_; }
92 EmptyOp empty() { DCHECK_EQ(opcode(), kInstEmptyWidth); return empty_; }
93
94 bool greedy(Prog* p) {
95 DCHECK_EQ(opcode(), kInstAltMatch);
96 return p->inst(out())->opcode() == kInstByteRange ||
97 (p->inst(out())->opcode() == kInstNop &&
98 p->inst(p->inst(out())->out())->opcode() == kInstByteRange);
99 }
100
101 // Does this inst (an kInstByteRange) match c?
102 inline bool Matches(int c) {
103 DCHECK_EQ(opcode(), kInstByteRange);
104 if (foldcase() && 'A' <= c && c <= 'Z')
105 c += 'a' - 'A';
106 return lo_ <= c && c <= hi_;
107 }
108
109 // Returns string representation for debugging.
110 std::string Dump();
111
112 // Maximum instruction id.
113 // (Must fit in out_opcode_. PatchList/last steal another bit.)
114 static const int kMaxInst = (1<<28) - 1;
115
116 private:
117 void set_opcode(InstOp opcode) {
118 out_opcode_ = (out()<<4) | (last()<<3) | opcode;
119 }
120
121 void set_last() {
122 out_opcode_ = (out()<<4) | (1<<3) | opcode();
123 }
124
125 void set_out(int out) {
126 out_opcode_ = (out<<4) | (last()<<3) | opcode();
127 }
128
129 void set_out_opcode(int out, InstOp opcode) {
130 out_opcode_ = (out<<4) | (last()<<3) | opcode;
131 }
132
133 uint32_t out_opcode_; // 28 bits: out, 1 bit: last, 3 (low) bits: opcode
134 union { // additional instruction arguments:
135 uint32_t out1_; // opcode == kInstAlt
136 // alternate next instruction
137
138 int32_t cap_; // opcode == kInstCapture
139 // Index of capture register (holds text
140 // position recorded by capturing parentheses).
141 // For \n (the submatch for the nth parentheses),
142 // the left parenthesis captures into register 2*n
143 // and the right one captures into register 2*n+1.
144
145 int32_t match_id_; // opcode == kInstMatch
146 // Match ID to identify this match (for re2::Set).
147
148 struct { // opcode == kInstByteRange
149 uint8_t lo_; // byte range is lo_-hi_ inclusive
150 uint8_t hi_; //
151 uint16_t hint_foldcase_; // 15 bits: hint, 1 (low) bit: foldcase
152 // hint to execution engines: the delta to the
153 // next instruction (in the current list) worth
154 // exploring iff this instruction matched; 0
155 // means there are no remaining possibilities,
156 // which is most likely for character classes.
157 // foldcase: A-Z -> a-z before checking range.
158 };
159
160 EmptyOp empty_; // opcode == kInstEmptyWidth
161 // empty_ is bitwise OR of kEmpty* flags above.
162 };
163
164 friend class Compiler;
165 friend struct PatchList;
166 friend class Prog;
167 };
168
169 // Inst must be trivial so that we can freely clear it with memset(3).
170 // Arrays of Inst are initialised by copying the initial elements with
171 // memmove(3) and then clearing any remaining elements with memset(3).
172 static_assert(std::is_trivial<Inst>::value, "Inst must be trivial");
173
174 // Whether to anchor the search.
175 enum Anchor {
176 kUnanchored, // match anywhere
177 kAnchored, // match only starting at beginning of text
178 };
179
180 // Kind of match to look for (for anchor != kFullMatch)
181 //
182 // kLongestMatch mode finds the overall longest
183 // match but still makes its submatch choices the way
184 // Perl would, not in the way prescribed by POSIX.
185 // The POSIX rules are much more expensive to implement,
186 // and no one has needed them.
187 //
188 // kFullMatch is not strictly necessary -- we could use
189 // kLongestMatch and then check the length of the match -- but
190 // the matching code can run faster if it knows to consider only
191 // full matches.
192 enum MatchKind {
193 kFirstMatch, // like Perl, PCRE
194 kLongestMatch, // like egrep or POSIX
195 kFullMatch, // match only entire text; implies anchor==kAnchored
196 kManyMatch // for SearchDFA, records set of matches
197 };
198
199 Inst *inst(int id) { return &inst_[id]; }
200 int start() { return start_; }
201 void set_start(int start) { start_ = start; }
202 int start_unanchored() { return start_unanchored_; }
203 void set_start_unanchored(int start) { start_unanchored_ = start; }
204 int size() { return size_; }
205 bool reversed() { return reversed_; }
206 void set_reversed(bool reversed) { reversed_ = reversed; }
207 int list_count() { return list_count_; }
208 int inst_count(InstOp op) { return inst_count_[op]; }
209 uint16_t* list_heads() { return list_heads_.data(); }
210 size_t bit_state_text_max_size() { return bit_state_text_max_size_; }
211 int64_t dfa_mem() { return dfa_mem_; }
212 void set_dfa_mem(int64_t dfa_mem) { dfa_mem_ = dfa_mem; }
213 bool anchor_start() { return anchor_start_; }
214 void set_anchor_start(bool b) { anchor_start_ = b; }
215 bool anchor_end() { return anchor_end_; }
216 void set_anchor_end(bool b) { anchor_end_ = b; }
217 int bytemap_range() { return bytemap_range_; }
218 const uint8_t* bytemap() { return bytemap_; }
219 bool can_prefix_accel() { return prefix_size_ != 0; }
220
221 // Accelerates to the first likely occurrence of the prefix.
222 // Returns a pointer to the first byte or NULL if not found.
223 const void* PrefixAccel(const void* data, size_t size) {
224 DCHECK(can_prefix_accel());
225 if (prefix_foldcase_) {
226 return PrefixAccel_ShiftDFA(data, size);
227 } else if (prefix_size_ != 1) {
228 return PrefixAccel_FrontAndBack(data, size);
229 } else {
230 return memchr(data, prefix_front_, size);
231 }
232 }
233
234 // Configures prefix accel using the analysis performed during compilation.
235 void ConfigurePrefixAccel(const std::string& prefix, bool prefix_foldcase);
236
237 // An implementation of prefix accel that uses prefix_dfa_ to perform
238 // case-insensitive search.
239 const void* PrefixAccel_ShiftDFA(const void* data, size_t size);
240
241 // An implementation of prefix accel that looks for prefix_front_ and
242 // prefix_back_ to return fewer false positives than memchr(3) alone.
243 const void* PrefixAccel_FrontAndBack(const void* data, size_t size);
244
245 // Returns string representation of program for debugging.
246 std::string Dump();
247 std::string DumpUnanchored();
248 std::string DumpByteMap();
249
250 // Returns the set of kEmpty flags that are in effect at
251 // position p within context.
252 static uint32_t EmptyFlags(absl::string_view context, const char* p);
253
254 // Returns whether byte c is a word character: ASCII only.
255 // Used by the implementation of \b and \B.
256 // This is not right for Unicode, but:
257 // - it's hard to get right in a byte-at-a-time matching world
258 // (the DFA has only one-byte lookahead).
259 // - even if the lookahead were possible, the Progs would be huge.
260 // This crude approximation is the same one PCRE uses.
261 static bool IsWordChar(uint8_t c) {
262 return ('A' <= c && c <= 'Z') ||
263 ('a' <= c && c <= 'z') ||
264 ('0' <= c && c <= '9') ||
265 c == '_';
266 }
267
268 // Execution engines. They all search for the regexp (run the prog)
269 // in text, which is in the larger context (used for ^ $ \b etc).
270 // Anchor and kind control the kind of search.
271 // Returns true if match found, false if not.
272 // If match found, fills match[0..nmatch-1] with submatch info.
273 // match[0] is overall match, match[1] is first set of parens, etc.
274 // If a particular submatch is not matched during the regexp match,
275 // it is set to NULL.
276 //
277 // Matching text == absl::string_view() is treated as any other empty
278 // string, but note that on return, it will not be possible to distinguish
279 // submatches that matched that empty string from submatches that didn't
280 // match anything. Either way, match[i] == NULL.
281
282 // Search using NFA: can find submatches but kind of slow.
283 bool SearchNFA(absl::string_view text, absl::string_view context,
284 Anchor anchor, MatchKind kind, absl::string_view* match,
285 int nmatch);
286
287 // Search using DFA: much faster than NFA but only finds
288 // end of match and can use a lot more memory.
289 // Returns whether a match was found.
290 // If the DFA runs out of memory, sets *failed to true and returns false.
291 // If matches != NULL and kind == kManyMatch and there is a match,
292 // SearchDFA fills matches with the match IDs of the final matching state.
293 bool SearchDFA(absl::string_view text, absl::string_view context,
294 Anchor anchor, MatchKind kind, absl::string_view* match0,
295 bool* failed, SparseSet* matches);
296
297 // The callback issued after building each DFA state with BuildEntireDFA().
298 // If next is null, then the memory budget has been exhausted and building
299 // will halt. Otherwise, the state has been built and next points to an array
300 // of bytemap_range()+1 slots holding the next states as per the bytemap and
301 // kByteEndText. The number of the state is implied by the callback sequence:
302 // the first callback is for state 0, the second callback is for state 1, ...
303 // match indicates whether the state is a matching state.
304 using DFAStateCallback = std::function<void(const int* next, bool match)>;
305
306 // Build the entire DFA for the given match kind.
307 // Usually the DFA is built out incrementally, as needed, which
308 // avoids lots of unnecessary work.
309 // If cb is not empty, it receives one callback per state built.
310 // Returns the number of states built.
311 // FOR TESTING OR EXPERIMENTAL PURPOSES ONLY.
312 int BuildEntireDFA(MatchKind kind, const DFAStateCallback& cb);
313
314 // Compute bytemap.
315 void ComputeByteMap();
316
317 // Run peep-hole optimizer on program.
318 void Optimize();
319
320 // One-pass NFA: only correct if IsOnePass() is true,
321 // but much faster than NFA (competitive with PCRE)
322 // for those expressions.
323 bool IsOnePass();
324 bool SearchOnePass(absl::string_view text, absl::string_view context,
325 Anchor anchor, MatchKind kind, absl::string_view* match,
326 int nmatch);
327
328 // Bit-state backtracking. Fast on small cases but uses memory
329 // proportional to the product of the list count and the text size.
330 bool CanBitState() { return list_heads_.data() != NULL; }
331 bool SearchBitState(absl::string_view text, absl::string_view context,
332 Anchor anchor, MatchKind kind, absl::string_view* match,
333 int nmatch);
334
335 static const int kMaxOnePassCapture = 5; // $0 through $4
336
337 // Backtracking search: the gold standard against which the other
338 // implementations are checked. FOR TESTING ONLY.
339 // It allocates a ton of memory to avoid running forever.
340 // It is also recursive, so can't use in production (will overflow stacks).
341 // The name "Unsafe" here is supposed to be a flag that
342 // you should not be using this function.
343 bool UnsafeSearchBacktrack(absl::string_view text, absl::string_view context,
344 Anchor anchor, MatchKind kind,
345 absl::string_view* match, int nmatch);
346
347 // Computes range for any strings matching regexp. The min and max can in
348 // some cases be arbitrarily precise, so the caller gets to specify the
349 // maximum desired length of string returned.
350 //
351 // Assuming PossibleMatchRange(&min, &max, N) returns successfully, any
352 // string s that is an anchored match for this regexp satisfies
353 // min <= s && s <= max.
354 //
355 // Note that PossibleMatchRange() will only consider the first copy of an
356 // infinitely repeated element (i.e., any regexp element followed by a '*' or
357 // '+' operator). Regexps with "{N}" constructions are not affected, as those
358 // do not compile down to infinite repetitions.
359 //
360 // Returns true on success, false on error.
361 bool PossibleMatchRange(std::string* min, std::string* max, int maxlen);
362
363 // Outputs the program fanout into the given sparse array.
364 void Fanout(SparseArray<int>* fanout);
365
366 // Compiles a collection of regexps to Prog. Each regexp will have
367 // its own Match instruction recording the index in the output vector.
368 static Prog* CompileSet(Regexp* re, RE2::Anchor anchor, int64_t max_mem);
369
370 // Flattens the Prog from "tree" form to "list" form. This is an in-place
371 // operation in the sense that the old instructions are lost.
372 void Flatten();
373
374 // Walks the Prog; the "successor roots" or predecessors of the reachable
375 // instructions are marked in rootmap or predmap/predvec, respectively.
376 // reachable and stk are preallocated scratch structures.
377 void MarkSuccessors(SparseArray<int>* rootmap,
378 SparseArray<int>* predmap,
379 std::vector<std::vector<int>>* predvec,
380 SparseSet* reachable, std::vector<int>* stk);
381
382 // Walks the Prog from the given "root" instruction; the "dominator root"
383 // of the reachable instructions (if such exists) is marked in rootmap.
384 // reachable and stk are preallocated scratch structures.
385 void MarkDominator(int root, SparseArray<int>* rootmap,
386 SparseArray<int>* predmap,
387 std::vector<std::vector<int>>* predvec,
388 SparseSet* reachable, std::vector<int>* stk);
389
390 // Walks the Prog from the given "root" instruction; the reachable
391 // instructions are emitted in "list" form and appended to flat.
392 // reachable and stk are preallocated scratch structures.
393 void EmitList(int root, SparseArray<int>* rootmap,
394 std::vector<Inst>* flat,
395 SparseSet* reachable, std::vector<int>* stk);
396
397 // Computes hints for ByteRange instructions in [begin, end).
398 void ComputeHints(std::vector<Inst>* flat, int begin, int end);
399
400 // Controls whether the DFA should bail out early if the NFA would be faster.
401 // FOR TESTING ONLY.
402 static void TESTING_ONLY_set_dfa_should_bail_when_slow(bool b);
403
404 private:
405 friend class Compiler;
406
407 DFA* GetDFA(MatchKind kind);
408 void DeleteDFA(DFA* dfa);
409
410 bool anchor_start_; // regexp has explicit start anchor
411 bool anchor_end_; // regexp has explicit end anchor
412 bool reversed_; // whether program runs backward over input
413 bool did_flatten_; // has Flatten been called?
414 bool did_onepass_; // has IsOnePass been called?
415
416 int start_; // entry point for program
417 int start_unanchored_; // unanchored entry point for program
418 int size_; // number of instructions
419 int bytemap_range_; // bytemap_[x] < bytemap_range_
420
421 bool prefix_foldcase_; // whether prefix is case-insensitive
422 size_t prefix_size_; // size of prefix (0 if no prefix)
423 union {
424 uint64_t* prefix_dfa_; // "Shift DFA" for prefix
425 struct {
426 int prefix_front_; // first byte of prefix
427 int prefix_back_; // last byte of prefix
428 };
429 };
430
431 int list_count_; // count of lists (see above)
432 int inst_count_[kNumInst]; // count of instructions by opcode
433 PODArray<uint16_t> list_heads_; // sparse array enumerating list heads
434 // not populated if size_ is overly large
435 size_t bit_state_text_max_size_; // upper bound (inclusive) on text.size()
436
437 PODArray<Inst> inst_; // pointer to instruction array
438 PODArray<uint8_t> onepass_nodes_; // data for OnePass nodes
439
440 int64_t dfa_mem_; // Maximum memory for DFAs.
441 DFA* dfa_first_; // DFA cached for kFirstMatch/kManyMatch
442 DFA* dfa_longest_; // DFA cached for kLongestMatch/kFullMatch
443
444 uint8_t bytemap_[256]; // map from input bytes to byte classes
445
446 absl::once_flag dfa_first_once_;
447 absl::once_flag dfa_longest_once_;
448
449 Prog(const Prog&) = delete;
450 Prog& operator=(const Prog&) = delete;
451};
452
453// std::string_view in MSVC has iterators that aren't just pointers and
454// that don't allow comparisons between different objects - not even if
455// those objects are views into the same string! Thus, we provide these
456// conversion functions for convenience.
457static inline const char* BeginPtr(absl::string_view s) {
458 return s.data();
459}
460static inline const char* EndPtr(absl::string_view s) {
461 return s.data() + s.size();
462}
463
464} // namespace re2
465
466#endif // RE2_PROG_H_
467