onepass.cc source code [tensorflow/external/com_googlesource_code_re2/re2/onepass.cc]

1	// Copyright 2008 The RE2 Authors. All Rights Reserved.
2	// Use of this source code is governed by a BSD-style
3	// license that can be found in the LICENSE file.
4
5	// Tested by search_test.cc.
6	//
7	// Prog::SearchOnePass is an efficient implementation of
8	// regular expression search with submatch tracking for
9	// what I call "one-pass regular expressions". (An alternate
10	// name might be "backtracking-free regular expressions".)
11	//
12	// One-pass regular expressions have the property that
13	// at each input byte during an anchored match, there may be
14	// multiple alternatives but only one can proceed for any
15	// given input byte.
16	//
17	// For example, the regexp /xyx/ is one-pass: you read
18	// x's until a y, then you read the y, then you keep reading x's.
19	// At no point do you have to guess what to do or back up
20	// and try a different guess.
21	//
22	// On the other hand, /xx/ is not one-pass: when you're*
23	// looking at an input "x", it's not clear whether you should
24	// use it to extend the x or as the final x.*
25	//
26	// More examples: /([^ ]) (.)/ is one-pass; /(.) (.)/ is not.
27	// /(\d+)-(\d+)/ is one-pass; /(\d+).(\d+)/ is not.
28	//
29	// A simple intuition for identifying one-pass regular expressions
30	// is that it's always immediately obvious when a repetition ends.
31	// It must also be immediately obvious which branch of an \| to take:
32	//
33	// /x(y\|z)/ is one-pass, but /(xy\|xz)/ is not.
34	//
35	// The NFA-based search in nfa.cc does some bookkeeping to
36	// avoid the need for backtracking and its associated exponential blowup.
37	// But if we have a one-pass regular expression, there is no
38	// possibility of backtracking, so there is no need for the
39	// extra bookkeeping. Hence, this code.
40	//
41	// On a one-pass regular expression, the NFA code in nfa.cc
42	// runs at about 1/20 of the backtracking-based PCRE speed.
43	// In contrast, the code in this file runs at about the same
44	// speed as PCRE.
45	//
46	// One-pass regular expressions get used a lot when RE is
47	// used for parsing simple strings, so it pays off to
48	// notice them and handle them efficiently.
49	//
50	// See also Anne Brüggemann-Klein and Derick Wood,
51	// "One-unambiguous regular languages", Information and Computation 142(2).
52
53	#include <stdint.h>
54	#include <string.h>
55	#include <algorithm>
56	#include <map>
57	#include <string>
58	#include <vector>
59
60	#include "absl/container/fixed_array.h"
61	#include "absl/container/inlined_vector.h"
62	#include "absl/strings/str_format.h"
63	#include "util/logging.h"
64	#include "util/utf.h"
65	#include "re2/pod_array.h"
66	#include "re2/prog.h"
67	#include "re2/sparse_set.h"
68
69	// Silence "zero-sized array in struct/union" warning for OneState::action.
70	#ifdef _MSC_VER
71	#pragma warning(disable: 4200)
72	#endif
73
74	namespace re2 {
75
76	static const bool ExtraDebug = false;
77
78	// The key insight behind this implementation is that the
79	// non-determinism in an NFA for a one-pass regular expression
80	// is contained. To explain what that means, first a
81	// refresher about what regular expression programs look like
82	// and how the usual NFA execution runs.
83	//
84	// In a regular expression program, only the kInstByteRange
85	// instruction processes an input byte c and moves on to the
86	// next byte in the string (it does so if c is in the given range).
87	// The kInstByteRange instructions correspond to literal characters
88	// and character classes in the regular expression.
89	//
90	// The kInstAlt instructions are used as wiring to connect the
91	// kInstByteRange instructions together in interesting ways when
92	// implementing \| + and .*
93	// The kInstAlt instruction forks execution, like a goto that
94	// jumps to ip->out() and ip->out1() in parallel. Each of the
95	// resulting computation paths is called a thread.
96	//
97	// The other instructions -- kInstEmptyWidth, kInstMatch, kInstCapture --
98	// are interesting in their own right but like kInstAlt they don't
99	// advance the input pointer. Only kInstByteRange does.
100	//
101	// The automaton execution in nfa.cc runs all the possible
102	// threads of execution in lock-step over the input. To process
103	// a particular byte, each thread gets run until it either dies
104	// or finds a kInstByteRange instruction matching the byte.
105	// If the latter happens, the thread stops just past the
106	// kInstByteRange instruction (at ip->out()) and waits for
107	// the other threads to finish processing the input byte.
108	// Then, once all the threads have processed that input byte,
109	// the whole process repeats. The kInstAlt state instruction
110	// might create new threads during input processing, but no
111	// matter what, all the threads stop after a kInstByteRange
112	// and wait for the other threads to "catch up".
113	// Running in lock step like this ensures that the NFA reads
114	// the input string only once.
115	//
116	// Each thread maintains its own set of capture registers
117	// (the string positions at which it executed the kInstCapture
118	// instructions corresponding to capturing parentheses in the
119	// regular expression). Repeated copying of the capture registers
120	// is the main performance bottleneck in the NFA implementation.
121	//
122	// A regular expression program is "one-pass" if, no matter what
123	// the input string, there is only one thread that makes it
124	// past a kInstByteRange instruction at each input byte. This means
125	// that there is in some sense only one active thread throughout
126	// the execution. Other threads might be created during the
127	// processing of an input byte, but they are ephemeral: only one
128	// thread is left to start processing the next input byte.
129	// This is what I meant above when I said the non-determinism
130	// was "contained".
131	//
132	// To execute a one-pass regular expression program, we can build
133	// a DFA (no non-determinism) that has at most as many states as
134	// the NFA (compare this to the possibly exponential number of states
135	// in the general case). Each state records, for each possible
136	// input byte, the next state along with the conditions required
137	// before entering that state -- empty-width flags that must be true
138	// and capture operations that must be performed. It also records
139	// whether a set of conditions required to finish a match at that
140	// point in the input rather than process the next byte.
141
142	// A state in the one-pass NFA - just an array of actions indexed
143	// by the bytemap_[] of the next input byte. (The bytemap
144	// maps next input bytes into equivalence classes, to reduce
145	// the memory footprint.)
146	struct OneState {
147	uint32_t matchcond; // conditions to match right now.
148	uint32_t action[];
149	};
150
151	// The uint32_t conditions in the action are a combination of
152	// condition and capture bits and the next state. The bottom 16 bits
153	// are the condition and capture bits, and the top 16 are the index of
154	// the next state.
155	//
156	// Bits 0-5 are the empty-width flags from prog.h.
157	// Bit 6 is kMatchWins, which means the match takes
158	// priority over moving to next in a first-match search.
159	// The remaining bits mark capture registers that should
160	// be set to the current input position. The capture bits
161	// start at index 2, since the search loop can take care of
162	// cap[0], cap[1] (the overall match position).
163	// That means we can handle up to 5 capturing parens: $1 through $4, plus $0.
164	// No input position can satisfy both kEmptyWordBoundary
165	// and kEmptyNonWordBoundary, so we can use that as a sentinel
166	// instead of needing an extra bit.
167
168	static const int kIndexShift = `16`; // number of bits below index
169	static const int kEmptyShift = `6`; // number of empty flags in prog.h
170	static const int kRealCapShift = kEmptyShift + `1`;
171	static const int kRealMaxCap = (kIndexShift - kRealCapShift) / `2` * `2`;
172
173	// Parameters used to skip over cap[0], cap[1].
174	static const int kCapShift = kRealCapShift - `2`;
175	static const int kMaxCap = kRealMaxCap + `2`;
176
177	static const uint32_t kMatchWins = `1` << kEmptyShift;
178	static const uint32_t kCapMask = ((`1` << kRealMaxCap) - `1`) << kRealCapShift;
179
180	static const uint32_t kImpossible = kEmptyWordBoundary \| kEmptyNonWordBoundary;
181
182	// Check, at compile time, that prog.h agrees with math above.
183	// This function is never called.
184	void OnePass_Checks() {
185	static_assert((`1`<<kEmptyShift)-`1` == kEmptyAllFlags,
186	"kEmptyShift disagrees with kEmptyAllFlags");
187	// kMaxCap counts pointers, kMaxOnePassCapture counts pairs.
188	static_assert(kMaxCap == Prog::kMaxOnePassCapture*`2`,
189	"kMaxCap disagrees with kMaxOnePassCapture");
190	}
191
192	static bool Satisfy(uint32_t cond, absl::string_view context, const char* p) {
193	uint32_t satisfied = Prog::EmptyFlags(context, p);
194	if (cond & kEmptyAllFlags & ~satisfied)
195	return false;
196	return true;
197	}
198
199	// Apply the capture bits in cond, saving p to the appropriate
200	// locations in cap[].
201	static void ApplyCaptures(uint32_t cond, const char* p,
202	const char** cap, int ncap) {
203	for (int i = `2`; i < ncap; i++)
204	if (cond & (`1` << kCapShift << i))
205	cap[i] = p;
206	}
207
208	// Computes the OneState for the given nodeindex.*
209	static inline OneState* IndexToNode(uint8_t* nodes, int statesize,
210	int nodeindex) {
211	return reinterpret_cast<OneState>(nodes + statesizenodeindex);
212	}
213
214	bool Prog::SearchOnePass(absl::string_view text, absl::string_view context,
215	Anchor anchor, MatchKind kind,
216	absl::string_view* match, int nmatch) {
217	if (anchor != kAnchored && kind != kFullMatch) {
218	LOG(DFATAL) << "Cannot use SearchOnePass for unanchored matches.";
219	return false;
220	}
221
222	// Make sure we have at least cap[1],
223	// because we use it to tell if we matched.
224	int ncap = `2`*nmatch;
225	if (ncap < `2`)
226	ncap = `2`;
227
228	const char* cap[kMaxCap];
229	for (int i = `0`; i < ncap; i++)
230	cap[i] = NULL;
231
232	const char* matchcap[kMaxCap];
233	for (int i = `0`; i < ncap; i++)
234	matchcap[i] = NULL;
235
236	if (context.data() == NULL)
237	context = text;
238	if (anchor_start() && BeginPtr(context) != BeginPtr(text))
239	return false;
240	if (anchor_end() && EndPtr(context) != EndPtr(text))
241	return false;
242	if (anchor_end())
243	kind = kFullMatch;
244
245	uint8_t* nodes = onepass_nodes_.data();
246	int statesize = sizeof(OneState) + bytemap_range()*sizeof(uint32_t);
247	// start() is always mapped to the zeroth OneState.
248	OneState* state = IndexToNode(nodes, statesize, `0`);
249	uint8_t* bytemap = bytemap_;
250	const char* bp = text.data();
251	const char* ep = text.data() + text.size();
252	const char* p;
253	bool matched = false;
254	matchcap[`0`] = bp;
255	cap[`0`] = bp;
256	uint32_t nextmatchcond = state->matchcond;
257	for (p = bp; p < ep; p++) {
258	int c = bytemap[*p & `0xFF`];
259	uint32_t matchcond = nextmatchcond;
260	uint32_t cond = state->action[c];
261
262	// Determine whether we can reach act->next.
263	// If so, advance state and nextmatchcond.
264	if ((cond & kEmptyAllFlags) == `0` \|\| Satisfy(cond, context, p)) {
265	uint32_t nextindex = cond >> kIndexShift;
266	state = IndexToNode(nodes, statesize, nextindex);
267	nextmatchcond = state->matchcond;
268	} else {
269	state = NULL;
270	nextmatchcond = kImpossible;
271	}
272
273	// This code section is carefully tuned.
274	// The goto sequence is about 10% faster than the
275	// obvious rewrite as a large if statement in the
276	// ASCIIMatchRE2 and DotMatchRE2 benchmarks.
277
278	// Saving the match capture registers is expensive.
279	// Is this intermediate match worth thinking about?
280
281	// Not if we want a full match.
282	if (kind == kFullMatch)
283	goto skipmatch;
284
285	// Not if it's impossible.
286	if (matchcond == kImpossible)
287	goto skipmatch;
288
289	// Not if the possible match is beaten by the certain
290	// match at the next byte. When this test is useless
291	// (e.g., HTTPPartialMatchRE2) it slows the loop by
292	// about 10%, but when it avoids work (e.g., DotMatchRE2),
293	// it cuts the loop execution by about 45%.
294	if ((cond & kMatchWins) == `0` && (nextmatchcond & kEmptyAllFlags) == `0`)
295	goto skipmatch;
296
297	// Finally, the match conditions must be satisfied.
298	if ((matchcond & kEmptyAllFlags) == `0` \|\| Satisfy(matchcond, context, p)) {
299	for (int i = `2`; i < `2`*nmatch; i++)
300	matchcap[i] = cap[i];
301	if (nmatch > `1` && (matchcond & kCapMask))
302	ApplyCaptures(matchcond, p, matchcap, ncap);
303	matchcap[`1`] = p;
304	matched = true;
305
306	// If we're in longest match mode, we have to keep
307	// going and see if we find a longer match.
308	// In first match mode, we can stop if the match
309	// takes priority over the next state for this input byte.
310	// That bit is per-input byte and thus in cond, not matchcond.
311	if (kind == kFirstMatch && (cond & kMatchWins))
312	goto done;
313	}
314
315	skipmatch:
316	if (state == NULL)
317	goto done;
318	if ((cond & kCapMask) && nmatch > `1`)
319	ApplyCaptures(cond, p, cap, ncap);
320	}
321
322	// Look for match at end of input.
323	{
324	uint32_t matchcond = state->matchcond;
325	if (matchcond != kImpossible &&
326	((matchcond & kEmptyAllFlags) == `0` \|\| Satisfy(matchcond, context, p))) {
327	if (nmatch > `1` && (matchcond & kCapMask))
328	ApplyCaptures(matchcond, p, cap, ncap);
329	for (int i = `2`; i < ncap; i++)
330	matchcap[i] = cap[i];
331	matchcap[`1`] = p;
332	matched = true;
333	}
334	}
335
336	done:
337	if (!matched)
338	return false;
339	for (int i = `0`; i < nmatch; i++)
340	match[i] = absl::string_view (
341	matchcap[`2` * i],
342	static_cast<size_t>(matchcap[`2` * i + `1`] - matchcap[`2` * i]));
343	return true;
344	}
345
346	// Analysis to determine whether a given regexp program is one-pass.
347
348	// If ip is not on workq, adds ip to work queue and returns true.
349	// If ip is already on work queue, does nothing and returns false.
350	// If ip is NULL, does nothing and returns true (pretends to add it).
351	typedef SparseSet Instq;
352	static bool AddQ(Instq q, int* id) {
353	if (id == `0`)
354	return true;
355	if (q->contains(id))
356	return false;
357	q->insert(id);
358	return true;
359	}
360
361	struct InstCond {
362	int id;
363	uint32_t cond;
364	};
365
366	// Returns whether this is a one-pass program; that is,
367	// returns whether it is safe to use SearchOnePass on this program.
368	// These conditions must be true for any instruction ip:
369	//
370	// (1) for any other Inst nip, there is at most one input-free
371	// path from ip to nip.
372	// (2) there is at most one kInstByte instruction reachable from
373	// ip that matches any particular byte c.
374	// (3) there is at most one input-free path from ip to a kInstMatch
375	// instruction.
376	//
377	// This is actually just a conservative approximation: it might
378	// return false when the answer is true, when kInstEmptyWidth
379	// instructions are involved.
380	// Constructs and saves corresponding one-pass NFA on success.
381	bool Prog::IsOnePass() {
382	if (did_onepass_)
383	return onepass_nodes_.data() != NULL;
384	did_onepass_ = true;
385
386	if (start() == `0`) // no match
387	return false;
388
389	// Steal memory for the one-pass NFA from the overall DFA budget.
390	// Willing to use at most 1/4 of the DFA budget (heuristic).
391	// Limit max node count to 65000 as a conservative estimate to
392	// avoid overflowing 16-bit node index in encoding.
393	int maxnodes = `2` + inst_count(kInstByteRange);
394	int statesize = sizeof(OneState) + bytemap_range()*sizeof(uint32_t);
395	if (maxnodes >= `65000` \|\| dfa_mem_ / `4` / statesize < maxnodes)
396	return false;
397
398	// Flood the graph starting at the start state, and check
399	// that in each reachable state, each possible byte leads
400	// to a unique next state.
401	int stacksize = inst_count(kInstCapture) +
402	inst_count(kInstEmptyWidth) +
403	inst_count(kInstNop) + `1`; // + 1 for start inst
404	absl::FixedArray<InstCond, `64`> stack_storage(stacksize);
405	InstCond* stack = stack_storage.data();
406
407	int size = this->size();
408	absl::FixedArray<int, `128`> nodebyid_storage(size, -`1`); // indexed by ip
409	int* nodebyid = nodebyid_storage.data();
410
411	// Originally, nodes was a uint8_t[maxnodesstatesize], but that was*
412	// unnecessarily optimistic: why allocate a large amount of memory
413	// upfront for a large program when it is unlikely to be one-pass?
414	absl::InlinedVector<uint8_t, `2048`> nodes;
415
416	Instq tovisit(size), workq(size);
417	AddQ(&tovisit, start());
418	nodebyid[start()] = `0`;
419	int nalloc = `1`;
420	nodes.insert(nodes.end(), statesize, `0`);
421	for (Instq::iterator it = tovisit.begin(); it != tovisit.end(); ++it) {
422	int id = *it;
423	int nodeindex = nodebyid[id];
424	OneState* node = IndexToNode(nodes.data(), statesize, nodeindex);
425
426	// Flood graph using manual stack, filling in actions as found.
427	// Default is none.
428	for (int b = `0`; b < bytemap_range_; b++)
429	node->action[b] = kImpossible;
430	node->matchcond = kImpossible;
431
432	workq.clear();
433	bool matched = false;
434	int nstack = `0`;
435	stack[nstack].id = id;
436	stack[nstack++].cond = `0`;
437	while (nstack > `0`) {
438	int id = stack[--nstack].id;
439	uint32_t cond = stack[nstack].cond;
440
441	Loop:
442	Prog::Inst* ip = inst(id);
443	switch (ip->opcode()) {
444	default:
445	LOG(DFATAL) << "unhandled opcode: " << ip->opcode();
446	break;
447
448	case kInstAltMatch:
449	// TODO(rsc): Ignoring kInstAltMatch optimization.
450	// Should implement it in this engine, but it's subtle.
451	DCHECK(!ip->last());
452	// If already on work queue, (1) is violated: bail out.
453	if (!AddQ(&workq, id+`1`))
454	goto fail;
455	id = id+`1`;
456	goto Loop;
457
458	case kInstByteRange: {
459	int nextindex = nodebyid[ip->out()];
460	if (nextindex == -`1`) {
461	if (nalloc >= maxnodes) {
462	if (ExtraDebug)
463	LOG(ERROR) << absl::StrFormat(
464	"Not OnePass: hit node limit %d >= %d", nalloc, maxnodes);
465	goto fail;
466	}
467	nextindex = nalloc;
468	AddQ(&tovisit, ip->out());
469	nodebyid[ip->out()] = nalloc;
470	nalloc++;
471	nodes.insert(nodes.end(), statesize, `0`);
472	// Update node because it might have been invalidated.
473	node = IndexToNode(nodes.data(), statesize, nodeindex);
474	}
475	for (int c = ip->lo(); c <= ip->hi(); c++) {
476	int b = bytemap_[c];
477	// Skip any bytes immediately after c that are also in b.
478	while (c < `256`-`1` && bytemap_[c+`1`] == b)
479	c++;
480	uint32_t act = node->action[b];
481	uint32_t newact = (nextindex << kIndexShift) \| cond;
482	if (matched)
483	newact \|= kMatchWins;
484	if ((act & kImpossible) == kImpossible) {
485	node->action[b] = newact;
486	} else if (act != newact) {
487	if (ExtraDebug)
488	LOG(ERROR) << absl::StrFormat(
489	"Not OnePass: conflict on byte %#x at state %d", c, *it);
490	goto fail;
491	}
492	}
493	if (ip->foldcase()) {
494	Rune lo = std::max<Rune>(ip->lo(), `'a'`) + `'A'` - `'a'`;
495	Rune hi = std::min<Rune>(ip->hi(), `'z'`) + `'A'` - `'a'`;
496	for (int c = lo; c <= hi; c++) {
497	int b = bytemap_[c];
498	// Skip any bytes immediately after c that are also in b.
499	while (c < `256`-`1` && bytemap_[c+`1`] == b)
500	c++;
501	uint32_t act = node->action[b];
502	uint32_t newact = (nextindex << kIndexShift) \| cond;
503	if (matched)
504	newact \|= kMatchWins;
505	if ((act & kImpossible) == kImpossible) {
506	node->action[b] = newact;
507	} else if (act != newact) {
508	if (ExtraDebug)
509	LOG(ERROR) << absl::StrFormat(
510	"Not OnePass: conflict on byte %#x at state %d", c, *it);
511	goto fail;
512	}
513	}
514	}
515
516	if (ip->last())
517	break;
518	// If already on work queue, (1) is violated: bail out.
519	if (!AddQ(&workq, id+`1`))
520	goto fail;
521	id = id+`1`;
522	goto Loop;
523	}
524
525	case kInstCapture:
526	case kInstEmptyWidth:
527	case kInstNop:
528	if (!ip->last()) {
529	// If already on work queue, (1) is violated: bail out.
530	if (!AddQ(&workq, id+`1`))
531	goto fail;
532	stack[nstack].id = id+`1`;
533	stack[nstack++].cond = cond;
534	}
535
536	if (ip->opcode() == kInstCapture && ip->cap() < kMaxCap)
537	cond \|= (`1` << kCapShift) << ip->cap();
538	if (ip->opcode() == kInstEmptyWidth)
539	cond \|= ip->empty();
540
541	// kInstCapture and kInstNop always proceed to ip->out().
542	// kInstEmptyWidth only sometimes proceeds to ip->out(),
543	// but as a conservative approximation we assume it always does.
544	// We could be a little more precise by looking at what c
545	// is, but that seems like overkill.
546
547	// If already on work queue, (1) is violated: bail out.
548	if (!AddQ(&workq, ip->out())) {
549	if (ExtraDebug)
550	LOG(ERROR) << absl::StrFormat(
551	"Not OnePass: multiple paths %d -> %d", *it, ip->out());
552	goto fail;
553	}
554	id = ip->out();
555	goto Loop;
556
557	case kInstMatch:
558	if (matched) {
559	// (3) is violated
560	if (ExtraDebug)
561	LOG(ERROR) << absl::StrFormat(
562	"Not OnePass: multiple matches from %d", *it);
563	goto fail;
564	}
565	matched = true;
566	node->matchcond = cond;
567
568	if (ip->last())
569	break;
570	// If already on work queue, (1) is violated: bail out.
571	if (!AddQ(&workq, id+`1`))
572	goto fail;
573	id = id+`1`;
574	goto Loop;
575
576	case kInstFail:
577	break;
578	}
579	}
580	}
581
582	if (ExtraDebug) { // For debugging, dump one-pass NFA to LOG(ERROR).
583	LOG(ERROR) << "bytemap:\n" << DumpByteMap();
584	LOG(ERROR) << "prog:\n" << Dump();
585
586	std::map<int, int> idmap;
587	for (int i = `0`; i < size; i++)
588	if (nodebyid[i] != -`1`)
589	idmap [nodebyid[i]] = i;
590
591	std::string dump;
592	for (Instq::iterator it = tovisit.begin(); it != tovisit.end(); ++it) {
593	int id = *it;
594	int nodeindex = nodebyid[id];
595	if (nodeindex == -`1`)
596	continue;
597	OneState* node = IndexToNode(nodes.data(), statesize, nodeindex);
598	dump += absl::StrFormat("node %d id=%d: matchcond=%#x\n",
599	nodeindex, id, node->matchcond);
600	for (int i = `0`; i < bytemap_range_; i++) {
601	if ((node->action[i] & kImpossible) == kImpossible)
602	continue;
603	dump += absl::StrFormat(" %d cond %#x -> %d id=%d\n",
604	i, node->action[i] & `0xFFFF`,
605	node->action[i] >> kIndexShift,
606	idmap [node->action[i] >> kIndexShift]);
607	}
608	}
609	LOG(ERROR) << "nodes:\n" << dump;
610	}
611
612	dfa_mem_ -= nalloc*statesize;
613	onepass_nodes_ = PODArray<uint8_t>(nalloc*statesize);
614	memmove(onepass_nodes_.data(), nodes.data(), nalloc*statesize);
615	return true;
616
617	fail:
618	return false;
619	}
620
621	} // namespace re2
622

Browse the source code of tensorflow/external/com_googlesource_code_re2/re2/onepass.cc