prog.h source code [tensorflow/external/com_googlesource_code_re2/re2/prog.h]

1	// Copyright 2007 The RE2 Authors. All Rights Reserved.
2	// Use of this source code is governed by a BSD-style
3	// license that can be found in the LICENSE file.
4
5	#ifndef RE2_PROG_H_
6	#define RE2_PROG_H_
7
8	// Compiled representation of regular expressions.
9	// See regexp.h for the Regexp class, which represents a regular
10	// expression symbolically.
11
12	#include <stdint.h>
13	#include <functional>
14	#include <string>
15	#include <vector>
16	#include <type_traits>
17
18	#include "absl/base/call_once.h"
19	#include "absl/strings/string_view.h"
20	#include "util/logging.h"
21	#include "re2/pod_array.h"
22	#include "re2/re2.h"
23	#include "re2/sparse_array.h"
24	#include "re2/sparse_set.h"
25
26	namespace re2 {
27
28	// Opcodes for Inst
29	enum InstOp {
30	kInstAlt = `0`, // choose between out_ and out1_
31	kInstAltMatch, // Alt: out_ is [00-FF] and back, out1_ is match; or vice versa.
32	kInstByteRange, // next (possible case-folded) byte must be in [lo_, hi_]
33	kInstCapture, // capturing parenthesis number cap_
34	kInstEmptyWidth, // empty-width special (^ $ ...); bit(s) set in empty_
35	kInstMatch, // found a match!
36	kInstNop, // no-op; occasionally unavoidable
37	kInstFail, // never match; occasionally unavoidable
38	kNumInst,
39	};
40
41	// Bit flags for empty-width specials
42	enum EmptyOp {
43	kEmptyBeginLine = `1`<<`0`, // ^ - beginning of line
44	kEmptyEndLine = `1`<<`1`, // $ - end of line
45	kEmptyBeginText = `1`<<`2`, // \A - beginning of text
46	kEmptyEndText = `1`<<`3`, // \z - end of text
47	kEmptyWordBoundary = `1`<<`4`, // \b - word boundary
48	kEmptyNonWordBoundary = `1`<<`5`, // \B - not \b
49	kEmptyAllFlags = (`1`<<`6`)-`1`,
50	};
51
52	class DFA;
53	class Regexp;
54
55	// Compiled form of regexp program.
56	class Prog {
57	public:
58	Prog();
59	~Prog();
60
61	// Single instruction in regexp program.
62	class Inst {
63	public:
64	// See the assertion below for why this is so.
65	Inst() = default;
66
67	// Copyable.
68	Inst(const Inst&) = default;
69	Inst& operator=(const Inst&) = default;
70
71	// Constructors per opcode
72	void InitAlt(uint32_t out, uint32_t out1);
73	void InitByteRange(int lo, int hi, int foldcase, uint32_t out);
74	void InitCapture(int cap, uint32_t out);
75	void InitEmptyWidth(EmptyOp empty, uint32_t out);
76	void InitMatch(int id);
77	void InitNop(uint32_t out);
78	void InitFail();
79
80	// Getters
81	int id(Prog* p) { return static_cast<int>(this - p->inst_.data()); }
82	InstOp opcode() { return static_cast<InstOp>(out_opcode_&`7`); }
83	int last() { return (out_opcode_>>`3`)&`1`; }
84	int out() { return out_opcode_>>`4`; }
85	int out1() { DCHECK(opcode() == kInstAlt \|\| opcode() == kInstAltMatch); return out1_; }
86	int cap() { DCHECK_EQ(opcode(), kInstCapture); return cap_; }
87	int lo() { DCHECK_EQ(opcode(), kInstByteRange); return lo_; }
88	int hi() { DCHECK_EQ(opcode(), kInstByteRange); return hi_; }
89	int foldcase() { DCHECK_EQ(opcode(), kInstByteRange); return hint_foldcase_&`1`; }
90	int hint() { DCHECK_EQ(opcode(), kInstByteRange); return hint_foldcase_>>`1`; }
91	int match_id() { DCHECK_EQ(opcode(), kInstMatch); return match_id_; }
92	EmptyOp empty() { DCHECK_EQ(opcode(), kInstEmptyWidth); return empty_; }
93
94	bool greedy(Prog* p) {
95	DCHECK_EQ(opcode(), kInstAltMatch);
96	return p->inst(out())->opcode() == kInstByteRange \|\|
97	(p->inst(out())->opcode() == kInstNop &&
98	p->inst(p->inst(out())->out())->opcode() == kInstByteRange);
99	}
100
101	// Does this inst (an kInstByteRange) match c?
102	inline bool Matches(int c) {
103	DCHECK_EQ(opcode(), kInstByteRange);
104	if (foldcase() && `'A'` <= c && c <= `'Z'`)
105	c += `'a'` - `'A'`;
106	return lo_ <= c && c <= hi_;
107	}
108
109	// Returns string representation for debugging.
110	std::string Dump();
111
112	// Maximum instruction id.
113	// (Must fit in out_opcode_. PatchList/last steal another bit.)
114	static const int kMaxInst = (`1`<<`28`) - `1`;
115
116	private:
117	void set_opcode(InstOp opcode) {
118	out_opcode_ = (out()<<`4`) \| (last()<<`3`) \| opcode;
119	}
120
121	void set_last() {
122	out_opcode_ = (out()<<`4`) \| (`1`<<`3`) \| opcode();
123	}
124
125	void set_out(int out) {
126	out_opcode_ = (out<<`4`) \| (last()<<`3`) \| opcode();
127	}
128
129	void set_out_opcode(int out, InstOp opcode) {
130	out_opcode_ = (out<<`4`) \| (last()<<`3`) \| opcode;
131	}
132
133	uint32_t out_opcode_; // 28 bits: out, 1 bit: last, 3 (low) bits: opcode
134	union { // additional instruction arguments:
135	uint32_t out1_; // opcode == kInstAlt
136	// alternate next instruction
137
138	int32_t cap_; // opcode == kInstCapture
139	// Index of capture register (holds text
140	// position recorded by capturing parentheses).
141	// For \n (the submatch for the nth parentheses),
142	// the left parenthesis captures into register 2n*
143	// and the right one captures into register 2n+1.*
144
145	int32_t match_id_; // opcode == kInstMatch
146	// Match ID to identify this match (for re2::Set).
147
148	struct { // opcode == kInstByteRange
149	uint8_t lo_; // byte range is lo_-hi_ inclusive
150	uint8_t hi_; //
151	uint16_t hint_foldcase_; // 15 bits: hint, 1 (low) bit: foldcase
152	// hint to execution engines: the delta to the
153	// next instruction (in the current list) worth
154	// exploring iff this instruction matched; 0
155	// means there are no remaining possibilities,
156	// which is most likely for character classes.
157	// foldcase: A-Z -> a-z before checking range.
158	};
159
160	EmptyOp empty_; // opcode == kInstEmptyWidth
161	// empty_ is bitwise OR of kEmpty flags above.*
162	};
163
164	friend class Compiler;
165	friend struct PatchList;
166	friend class Prog;
167	};
168
169	// Inst must be trivial so that we can freely clear it with memset(3).
170	// Arrays of Inst are initialised by copying the initial elements with
171	// memmove(3) and then clearing any remaining elements with memset(3).
172	static_assert(std::is_trivial<Inst>::value, "Inst must be trivial");
173
174	// Whether to anchor the search.
175	enum Anchor {
176	kUnanchored, // match anywhere
177	kAnchored, // match only starting at beginning of text
178	};
179
180	// Kind of match to look for (for anchor != kFullMatch)
181	//
182	// kLongestMatch mode finds the overall longest
183	// match but still makes its submatch choices the way
184	// Perl would, not in the way prescribed by POSIX.
185	// The POSIX rules are much more expensive to implement,
186	// and no one has needed them.
187	//
188	// kFullMatch is not strictly necessary -- we could use
189	// kLongestMatch and then check the length of the match -- but
190	// the matching code can run faster if it knows to consider only
191	// full matches.
192	enum MatchKind {
193	kFirstMatch, // like Perl, PCRE
194	kLongestMatch, // like egrep or POSIX
195	kFullMatch, // match only entire text; implies anchor==kAnchored
196	kManyMatch // for SearchDFA, records set of matches
197	};
198
199	Inst inst(int* id) { return &inst_[id]; }
200	int start() { return start_; }
201	void set_start(int start) { start_ = start; }
202	int start_unanchored() { return start_unanchored_; }
203	void set_start_unanchored(int start) { start_unanchored_ = start; }
204	int size() { return size_; }
205	bool reversed() { return reversed_; }
206	void set_reversed(bool reversed) { reversed_ = reversed; }
207	int list_count() { return list_count_; }
208	int inst_count(InstOp op) { return inst_count_[op]; }
209	uint16_t* list_heads() { return list_heads_.data(); }
210	size_t bit_state_text_max_size() { return bit_state_text_max_size_; }
211	int64_t dfa_mem() { return dfa_mem_; }
212	void set_dfa_mem(int64_t dfa_mem) { dfa_mem_ = dfa_mem; }
213	bool anchor_start() { return anchor_start_; }
214	void set_anchor_start(bool b) { anchor_start_ = b; }
215	bool anchor_end() { return anchor_end_; }
216	void set_anchor_end(bool b) { anchor_end_ = b; }
217	int bytemap_range() { return bytemap_range_; }
218	const uint8_t* bytemap() { return bytemap_; }
219	bool can_prefix_accel() { return prefix_size_ != `0`; }
220
221	// Accelerates to the first likely occurrence of the prefix.
222	// Returns a pointer to the first byte or NULL if not found.
223	const void* PrefixAccel(const void* data, size_t size) {
224	DCHECK(can_prefix_accel());
225	if (prefix_foldcase_) {
226	return PrefixAccel_ShiftDFA(data, size);
227	} else if (prefix_size_ != `1`) {
228	return PrefixAccel_FrontAndBack(data, size);
229	} else {
230	return memchr(data, prefix_front_, size);
231	}
232	}
233
234	// Configures prefix accel using the analysis performed during compilation.
235	void ConfigurePrefixAccel(const std::string& prefix, bool prefix_foldcase);
236
237	// An implementation of prefix accel that uses prefix_dfa_ to perform
238	// case-insensitive search.
239	const void* PrefixAccel_ShiftDFA(const void* data, size_t size);
240
241	// An implementation of prefix accel that looks for prefix_front_ and
242	// prefix_back_ to return fewer false positives than memchr(3) alone.
243	const void* PrefixAccel_FrontAndBack(const void* data, size_t size);
244
245	// Returns string representation of program for debugging.
246	std::string Dump();
247	std::string DumpUnanchored();
248	std::string DumpByteMap();
249
250	// Returns the set of kEmpty flags that are in effect at
251	// position p within context.
252	static uint32_t EmptyFlags(absl::string_view context, const char* p);
253
254	// Returns whether byte c is a word character: ASCII only.
255	// Used by the implementation of \b and \B.
256	// This is not right for Unicode, but:
257	// - it's hard to get right in a byte-at-a-time matching world
258	// (the DFA has only one-byte lookahead).
259	// - even if the lookahead were possible, the Progs would be huge.
260	// This crude approximation is the same one PCRE uses.
261	static bool IsWordChar(uint8_t c) {
262	return (`'A'` <= c && c <= `'Z'`) \|\|
263	(`'a'` <= c && c <= `'z'`) \|\|
264	(`'0'` <= c && c <= `'9'`) \|\|
265	c == `'_'`;
266	}
267
268	// Execution engines. They all search for the regexp (run the prog)
269	// in text, which is in the larger context (used for ^ $ \b etc).
270	// Anchor and kind control the kind of search.
271	// Returns true if match found, false if not.
272	// If match found, fills match[0..nmatch-1] with submatch info.
273	// match[0] is overall match, match[1] is first set of parens, etc.
274	// If a particular submatch is not matched during the regexp match,
275	// it is set to NULL.
276	//
277	// Matching text == absl::string_view() is treated as any other empty
278	// string, but note that on return, it will not be possible to distinguish
279	// submatches that matched that empty string from submatches that didn't
280	// match anything. Either way, match[i] == NULL.
281
282	// Search using NFA: can find submatches but kind of slow.
283	bool SearchNFA(absl::string_view text, absl::string_view context,
284	Anchor anchor, MatchKind kind, absl::string_view* match,
285	int nmatch);
286
287	// Search using DFA: much faster than NFA but only finds
288	// end of match and can use a lot more memory.
289	// Returns whether a match was found.
290	// If the DFA runs out of memory, sets failed to true and returns false.*
291	// If matches != NULL and kind == kManyMatch and there is a match,
292	// SearchDFA fills matches with the match IDs of the final matching state.
293	bool SearchDFA(absl::string_view text, absl::string_view context,
294	Anchor anchor, MatchKind kind, absl::string_view* match0,
295	bool* failed, SparseSet* matches);
296
297	// The callback issued after building each DFA state with BuildEntireDFA().
298	// If next is null, then the memory budget has been exhausted and building
299	// will halt. Otherwise, the state has been built and next points to an array
300	// of bytemap_range()+1 slots holding the next states as per the bytemap and
301	// kByteEndText. The number of the state is implied by the callback sequence:
302	// the first callback is for state 0, the second callback is for state 1, ...
303	// match indicates whether the state is a matching state.
304	using DFAStateCallback = std::function<void(const int* next, bool match)>;
305
306	// Build the entire DFA for the given match kind.
307	// Usually the DFA is built out incrementally, as needed, which
308	// avoids lots of unnecessary work.
309	// If cb is not empty, it receives one callback per state built.
310	// Returns the number of states built.
311	// FOR TESTING OR EXPERIMENTAL PURPOSES ONLY.
312	int BuildEntireDFA(MatchKind kind, const DFAStateCallback& cb);
313
314	// Compute bytemap.
315	void ComputeByteMap();
316
317	// Run peep-hole optimizer on program.
318	void Optimize();
319
320	// One-pass NFA: only correct if IsOnePass() is true,
321	// but much faster than NFA (competitive with PCRE)
322	// for those expressions.
323	bool IsOnePass();
324	bool SearchOnePass(absl::string_view text, absl::string_view context,
325	Anchor anchor, MatchKind kind, absl::string_view* match,
326	int nmatch);
327
328	// Bit-state backtracking. Fast on small cases but uses memory
329	// proportional to the product of the list count and the text size.
330	bool CanBitState() { return list_heads_.data() != NULL; }
331	bool SearchBitState(absl::string_view text, absl::string_view context,
332	Anchor anchor, MatchKind kind, absl::string_view* match,
333	int nmatch);
334
335	static const int kMaxOnePassCapture = `5`; // $0 through $4
336
337	// Backtracking search: the gold standard against which the other
338	// implementations are checked. FOR TESTING ONLY.
339	// It allocates a ton of memory to avoid running forever.
340	// It is also recursive, so can't use in production (will overflow stacks).
341	// The name "Unsafe" here is supposed to be a flag that
342	// you should not be using this function.
343	bool UnsafeSearchBacktrack(absl::string_view text, absl::string_view context,
344	Anchor anchor, MatchKind kind,
345	absl::string_view* match, int nmatch);
346
347	// Computes range for any strings matching regexp. The min and max can in
348	// some cases be arbitrarily precise, so the caller gets to specify the
349	// maximum desired length of string returned.
350	//
351	// Assuming PossibleMatchRange(&min, &max, N) returns successfully, any
352	// string s that is an anchored match for this regexp satisfies
353	// min <= s && s <= max.
354	//
355	// Note that PossibleMatchRange() will only consider the first copy of an
356	// infinitely repeated element (i.e., any regexp element followed by a '' or*
357	// '+' operator). Regexps with "{N}" constructions are not affected, as those
358	// do not compile down to infinite repetitions.
359	//
360	// Returns true on success, false on error.
361	bool PossibleMatchRange(std::string* min, std::string* max, int maxlen);
362
363	// Outputs the program fanout into the given sparse array.
364	void Fanout(SparseArray<int>* fanout);
365
366	// Compiles a collection of regexps to Prog. Each regexp will have
367	// its own Match instruction recording the index in the output vector.
368	static Prog* CompileSet(Regexp* re, RE2::Anchor anchor, int64_t max_mem);
369
370	// Flattens the Prog from "tree" form to "list" form. This is an in-place
371	// operation in the sense that the old instructions are lost.
372	void Flatten();
373
374	// Walks the Prog; the "successor roots" or predecessors of the reachable
375	// instructions are marked in rootmap or predmap/predvec, respectively.
376	// reachable and stk are preallocated scratch structures.
377	void MarkSuccessors(SparseArray<int>* rootmap,
378	SparseArray<int>* predmap,
379	std::vector<std::vector<int>>* predvec,
380	SparseSet* reachable, std::vector<int>* stk);
381
382	// Walks the Prog from the given "root" instruction; the "dominator root"
383	// of the reachable instructions (if such exists) is marked in rootmap.
384	// reachable and stk are preallocated scratch structures.
385	void MarkDominator(int root, SparseArray<int>* rootmap,
386	SparseArray<int>* predmap,
387	std::vector<std::vector<int>>* predvec,
388	SparseSet* reachable, std::vector<int>* stk);
389
390	// Walks the Prog from the given "root" instruction; the reachable
391	// instructions are emitted in "list" form and appended to flat.
392	// reachable and stk are preallocated scratch structures.
393	void EmitList(int root, SparseArray<int>* rootmap,
394	std::vector<Inst>* flat,
395	SparseSet* reachable, std::vector<int>* stk);
396
397	// Computes hints for ByteRange instructions in [begin, end).
398	void ComputeHints(std::vector<Inst>* flat, int begin, int end);
399
400	// Controls whether the DFA should bail out early if the NFA would be faster.
401	// FOR TESTING ONLY.
402	static void TESTING_ONLY_set_dfa_should_bail_when_slow(bool b);
403
404	private:
405	friend class Compiler;
406
407	DFA* GetDFA(MatchKind kind);
408	void DeleteDFA(DFA* dfa);
409
410	bool anchor_start_; // regexp has explicit start anchor
411	bool anchor_end_; // regexp has explicit end anchor
412	bool reversed_; // whether program runs backward over input
413	bool did_flatten_; // has Flatten been called?
414	bool did_onepass_; // has IsOnePass been called?
415
416	int start_; // entry point for program
417	int start_unanchored_; // unanchored entry point for program
418	int size_; // number of instructions
419	int bytemap_range_; // bytemap_[x] < bytemap_range_
420
421	bool prefix_foldcase_; // whether prefix is case-insensitive
422	size_t prefix_size_; // size of prefix (0 if no prefix)
423	union {
424	uint64_t* prefix_dfa_; // "Shift DFA" for prefix
425	struct {
426	int prefix_front_; // first byte of prefix
427	int prefix_back_; // last byte of prefix
428	};
429	};
430
431	int list_count_; // count of lists (see above)
432	int inst_count_[kNumInst]; // count of instructions by opcode
433	PODArray<uint16_t> list_heads_; // sparse array enumerating list heads
434	// not populated if size_ is overly large
435	size_t bit_state_text_max_size_; // upper bound (inclusive) on text.size()
436
437	PODArray<Inst> inst_; // pointer to instruction array
438	PODArray<uint8_t> onepass_nodes_; // data for OnePass nodes
439
440	int64_t dfa_mem_; // Maximum memory for DFAs.
441	DFA* dfa_first_; // DFA cached for kFirstMatch/kManyMatch
442	DFA* dfa_longest_; // DFA cached for kLongestMatch/kFullMatch
443
444	uint8_t bytemap_[`256`]; // map from input bytes to byte classes
445
446	absl::once_flag dfa_first_once_;
447	absl::once_flag dfa_longest_once_;
448
449	Prog(const Prog&) = delete;
450	Prog& operator=(const Prog&) = delete;
451	};
452
453	// std::string_view in MSVC has iterators that aren't just pointers and
454	// that don't allow comparisons between different objects - not even if
455	// those objects are views into the same string! Thus, we provide these
456	// conversion functions for convenience.
457	static inline const char* BeginPtr(absl::string_view s) {
458	return s.data();
459	}
460	static inline const char* EndPtr(absl::string_view s) {
461	return s.data() + s.size();
462	}
463
464	} // namespace re2
465
466	#endif // RE2_PROG_H_
467

Browse the source code of tensorflow/external/com_googlesource_code_re2/re2/prog.h