1// Copyright 2008 The RE2 Authors. All Rights Reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5// Tested by search_test.cc.
6//
7// Prog::SearchOnePass is an efficient implementation of
8// regular expression search with submatch tracking for
9// what I call "one-pass regular expressions". (An alternate
10// name might be "backtracking-free regular expressions".)
11//
12// One-pass regular expressions have the property that
13// at each input byte during an anchored match, there may be
14// multiple alternatives but only one can proceed for any
15// given input byte.
16//
17// For example, the regexp /x*yx*/ is one-pass: you read
18// x's until a y, then you read the y, then you keep reading x's.
19// At no point do you have to guess what to do or back up
20// and try a different guess.
21//
22// On the other hand, /x*x/ is not one-pass: when you're
23// looking at an input "x", it's not clear whether you should
24// use it to extend the x* or as the final x.
25//
26// More examples: /([^ ]*) (.*)/ is one-pass; /(.*) (.*)/ is not.
27// /(\d+)-(\d+)/ is one-pass; /(\d+).(\d+)/ is not.
28//
29// A simple intuition for identifying one-pass regular expressions
30// is that it's always immediately obvious when a repetition ends.
31// It must also be immediately obvious which branch of an | to take:
32//
33// /x(y|z)/ is one-pass, but /(xy|xz)/ is not.
34//
35// The NFA-based search in nfa.cc does some bookkeeping to
36// avoid the need for backtracking and its associated exponential blowup.
37// But if we have a one-pass regular expression, there is no
38// possibility of backtracking, so there is no need for the
39// extra bookkeeping. Hence, this code.
40//
41// On a one-pass regular expression, the NFA code in nfa.cc
42// runs at about 1/20 of the backtracking-based PCRE speed.
43// In contrast, the code in this file runs at about the same
44// speed as PCRE.
45//
46// One-pass regular expressions get used a lot when RE is
47// used for parsing simple strings, so it pays off to
48// notice them and handle them efficiently.
49//
50// See also Anne Brüggemann-Klein and Derick Wood,
51// "One-unambiguous regular languages", Information and Computation 142(2).
52
53#include <stdint.h>
54#include <string.h>
55#include <algorithm>
56#include <map>
57#include <string>
58#include <vector>
59
60#include "absl/container/fixed_array.h"
61#include "absl/container/inlined_vector.h"
62#include "absl/strings/str_format.h"
63#include "util/logging.h"
64#include "util/utf.h"
65#include "re2/pod_array.h"
66#include "re2/prog.h"
67#include "re2/sparse_set.h"
68
69// Silence "zero-sized array in struct/union" warning for OneState::action.
70#ifdef _MSC_VER
71#pragma warning(disable: 4200)
72#endif
73
74namespace re2 {
75
76static const bool ExtraDebug = false;
77
78// The key insight behind this implementation is that the
79// non-determinism in an NFA for a one-pass regular expression
80// is contained. To explain what that means, first a
81// refresher about what regular expression programs look like
82// and how the usual NFA execution runs.
83//
84// In a regular expression program, only the kInstByteRange
85// instruction processes an input byte c and moves on to the
86// next byte in the string (it does so if c is in the given range).
87// The kInstByteRange instructions correspond to literal characters
88// and character classes in the regular expression.
89//
90// The kInstAlt instructions are used as wiring to connect the
91// kInstByteRange instructions together in interesting ways when
92// implementing | + and *.
93// The kInstAlt instruction forks execution, like a goto that
94// jumps to ip->out() and ip->out1() in parallel. Each of the
95// resulting computation paths is called a thread.
96//
97// The other instructions -- kInstEmptyWidth, kInstMatch, kInstCapture --
98// are interesting in their own right but like kInstAlt they don't
99// advance the input pointer. Only kInstByteRange does.
100//
101// The automaton execution in nfa.cc runs all the possible
102// threads of execution in lock-step over the input. To process
103// a particular byte, each thread gets run until it either dies
104// or finds a kInstByteRange instruction matching the byte.
105// If the latter happens, the thread stops just past the
106// kInstByteRange instruction (at ip->out()) and waits for
107// the other threads to finish processing the input byte.
108// Then, once all the threads have processed that input byte,
109// the whole process repeats. The kInstAlt state instruction
110// might create new threads during input processing, but no
111// matter what, all the threads stop after a kInstByteRange
112// and wait for the other threads to "catch up".
113// Running in lock step like this ensures that the NFA reads
114// the input string only once.
115//
116// Each thread maintains its own set of capture registers
117// (the string positions at which it executed the kInstCapture
118// instructions corresponding to capturing parentheses in the
119// regular expression). Repeated copying of the capture registers
120// is the main performance bottleneck in the NFA implementation.
121//
122// A regular expression program is "one-pass" if, no matter what
123// the input string, there is only one thread that makes it
124// past a kInstByteRange instruction at each input byte. This means
125// that there is in some sense only one active thread throughout
126// the execution. Other threads might be created during the
127// processing of an input byte, but they are ephemeral: only one
128// thread is left to start processing the next input byte.
129// This is what I meant above when I said the non-determinism
130// was "contained".
131//
132// To execute a one-pass regular expression program, we can build
133// a DFA (no non-determinism) that has at most as many states as
134// the NFA (compare this to the possibly exponential number of states
135// in the general case). Each state records, for each possible
136// input byte, the next state along with the conditions required
137// before entering that state -- empty-width flags that must be true
138// and capture operations that must be performed. It also records
139// whether a set of conditions required to finish a match at that
140// point in the input rather than process the next byte.
141
142// A state in the one-pass NFA - just an array of actions indexed
143// by the bytemap_[] of the next input byte. (The bytemap
144// maps next input bytes into equivalence classes, to reduce
145// the memory footprint.)
146struct OneState {
147 uint32_t matchcond; // conditions to match right now.
148 uint32_t action[];
149};
150
151// The uint32_t conditions in the action are a combination of
152// condition and capture bits and the next state. The bottom 16 bits
153// are the condition and capture bits, and the top 16 are the index of
154// the next state.
155//
156// Bits 0-5 are the empty-width flags from prog.h.
157// Bit 6 is kMatchWins, which means the match takes
158// priority over moving to next in a first-match search.
159// The remaining bits mark capture registers that should
160// be set to the current input position. The capture bits
161// start at index 2, since the search loop can take care of
162// cap[0], cap[1] (the overall match position).
163// That means we can handle up to 5 capturing parens: $1 through $4, plus $0.
164// No input position can satisfy both kEmptyWordBoundary
165// and kEmptyNonWordBoundary, so we can use that as a sentinel
166// instead of needing an extra bit.
167
168static const int kIndexShift = 16; // number of bits below index
169static const int kEmptyShift = 6; // number of empty flags in prog.h
170static const int kRealCapShift = kEmptyShift + 1;
171static const int kRealMaxCap = (kIndexShift - kRealCapShift) / 2 * 2;
172
173// Parameters used to skip over cap[0], cap[1].
174static const int kCapShift = kRealCapShift - 2;
175static const int kMaxCap = kRealMaxCap + 2;
176
177static const uint32_t kMatchWins = 1 << kEmptyShift;
178static const uint32_t kCapMask = ((1 << kRealMaxCap) - 1) << kRealCapShift;
179
180static const uint32_t kImpossible = kEmptyWordBoundary | kEmptyNonWordBoundary;
181
182// Check, at compile time, that prog.h agrees with math above.
183// This function is never called.
184void OnePass_Checks() {
185 static_assert((1<<kEmptyShift)-1 == kEmptyAllFlags,
186 "kEmptyShift disagrees with kEmptyAllFlags");
187 // kMaxCap counts pointers, kMaxOnePassCapture counts pairs.
188 static_assert(kMaxCap == Prog::kMaxOnePassCapture*2,
189 "kMaxCap disagrees with kMaxOnePassCapture");
190}
191
192static bool Satisfy(uint32_t cond, absl::string_view context, const char* p) {
193 uint32_t satisfied = Prog::EmptyFlags(context, p);
194 if (cond & kEmptyAllFlags & ~satisfied)
195 return false;
196 return true;
197}
198
199// Apply the capture bits in cond, saving p to the appropriate
200// locations in cap[].
201static void ApplyCaptures(uint32_t cond, const char* p,
202 const char** cap, int ncap) {
203 for (int i = 2; i < ncap; i++)
204 if (cond & (1 << kCapShift << i))
205 cap[i] = p;
206}
207
208// Computes the OneState* for the given nodeindex.
209static inline OneState* IndexToNode(uint8_t* nodes, int statesize,
210 int nodeindex) {
211 return reinterpret_cast<OneState*>(nodes + statesize*nodeindex);
212}
213
214bool Prog::SearchOnePass(absl::string_view text, absl::string_view context,
215 Anchor anchor, MatchKind kind,
216 absl::string_view* match, int nmatch) {
217 if (anchor != kAnchored && kind != kFullMatch) {
218 LOG(DFATAL) << "Cannot use SearchOnePass for unanchored matches.";
219 return false;
220 }
221
222 // Make sure we have at least cap[1],
223 // because we use it to tell if we matched.
224 int ncap = 2*nmatch;
225 if (ncap < 2)
226 ncap = 2;
227
228 const char* cap[kMaxCap];
229 for (int i = 0; i < ncap; i++)
230 cap[i] = NULL;
231
232 const char* matchcap[kMaxCap];
233 for (int i = 0; i < ncap; i++)
234 matchcap[i] = NULL;
235
236 if (context.data() == NULL)
237 context = text;
238 if (anchor_start() && BeginPtr(context) != BeginPtr(text))
239 return false;
240 if (anchor_end() && EndPtr(context) != EndPtr(text))
241 return false;
242 if (anchor_end())
243 kind = kFullMatch;
244
245 uint8_t* nodes = onepass_nodes_.data();
246 int statesize = sizeof(OneState) + bytemap_range()*sizeof(uint32_t);
247 // start() is always mapped to the zeroth OneState.
248 OneState* state = IndexToNode(nodes, statesize, 0);
249 uint8_t* bytemap = bytemap_;
250 const char* bp = text.data();
251 const char* ep = text.data() + text.size();
252 const char* p;
253 bool matched = false;
254 matchcap[0] = bp;
255 cap[0] = bp;
256 uint32_t nextmatchcond = state->matchcond;
257 for (p = bp; p < ep; p++) {
258 int c = bytemap[*p & 0xFF];
259 uint32_t matchcond = nextmatchcond;
260 uint32_t cond = state->action[c];
261
262 // Determine whether we can reach act->next.
263 // If so, advance state and nextmatchcond.
264 if ((cond & kEmptyAllFlags) == 0 || Satisfy(cond, context, p)) {
265 uint32_t nextindex = cond >> kIndexShift;
266 state = IndexToNode(nodes, statesize, nextindex);
267 nextmatchcond = state->matchcond;
268 } else {
269 state = NULL;
270 nextmatchcond = kImpossible;
271 }
272
273 // This code section is carefully tuned.
274 // The goto sequence is about 10% faster than the
275 // obvious rewrite as a large if statement in the
276 // ASCIIMatchRE2 and DotMatchRE2 benchmarks.
277
278 // Saving the match capture registers is expensive.
279 // Is this intermediate match worth thinking about?
280
281 // Not if we want a full match.
282 if (kind == kFullMatch)
283 goto skipmatch;
284
285 // Not if it's impossible.
286 if (matchcond == kImpossible)
287 goto skipmatch;
288
289 // Not if the possible match is beaten by the certain
290 // match at the next byte. When this test is useless
291 // (e.g., HTTPPartialMatchRE2) it slows the loop by
292 // about 10%, but when it avoids work (e.g., DotMatchRE2),
293 // it cuts the loop execution by about 45%.
294 if ((cond & kMatchWins) == 0 && (nextmatchcond & kEmptyAllFlags) == 0)
295 goto skipmatch;
296
297 // Finally, the match conditions must be satisfied.
298 if ((matchcond & kEmptyAllFlags) == 0 || Satisfy(matchcond, context, p)) {
299 for (int i = 2; i < 2*nmatch; i++)
300 matchcap[i] = cap[i];
301 if (nmatch > 1 && (matchcond & kCapMask))
302 ApplyCaptures(matchcond, p, matchcap, ncap);
303 matchcap[1] = p;
304 matched = true;
305
306 // If we're in longest match mode, we have to keep
307 // going and see if we find a longer match.
308 // In first match mode, we can stop if the match
309 // takes priority over the next state for this input byte.
310 // That bit is per-input byte and thus in cond, not matchcond.
311 if (kind == kFirstMatch && (cond & kMatchWins))
312 goto done;
313 }
314
315 skipmatch:
316 if (state == NULL)
317 goto done;
318 if ((cond & kCapMask) && nmatch > 1)
319 ApplyCaptures(cond, p, cap, ncap);
320 }
321
322 // Look for match at end of input.
323 {
324 uint32_t matchcond = state->matchcond;
325 if (matchcond != kImpossible &&
326 ((matchcond & kEmptyAllFlags) == 0 || Satisfy(matchcond, context, p))) {
327 if (nmatch > 1 && (matchcond & kCapMask))
328 ApplyCaptures(matchcond, p, cap, ncap);
329 for (int i = 2; i < ncap; i++)
330 matchcap[i] = cap[i];
331 matchcap[1] = p;
332 matched = true;
333 }
334 }
335
336done:
337 if (!matched)
338 return false;
339 for (int i = 0; i < nmatch; i++)
340 match[i] = absl::string_view(
341 matchcap[2 * i],
342 static_cast<size_t>(matchcap[2 * i + 1] - matchcap[2 * i]));
343 return true;
344}
345
346// Analysis to determine whether a given regexp program is one-pass.
347
348// If ip is not on workq, adds ip to work queue and returns true.
349// If ip is already on work queue, does nothing and returns false.
350// If ip is NULL, does nothing and returns true (pretends to add it).
351typedef SparseSet Instq;
352static bool AddQ(Instq *q, int id) {
353 if (id == 0)
354 return true;
355 if (q->contains(id))
356 return false;
357 q->insert(id);
358 return true;
359}
360
361struct InstCond {
362 int id;
363 uint32_t cond;
364};
365
366// Returns whether this is a one-pass program; that is,
367// returns whether it is safe to use SearchOnePass on this program.
368// These conditions must be true for any instruction ip:
369//
370// (1) for any other Inst nip, there is at most one input-free
371// path from ip to nip.
372// (2) there is at most one kInstByte instruction reachable from
373// ip that matches any particular byte c.
374// (3) there is at most one input-free path from ip to a kInstMatch
375// instruction.
376//
377// This is actually just a conservative approximation: it might
378// return false when the answer is true, when kInstEmptyWidth
379// instructions are involved.
380// Constructs and saves corresponding one-pass NFA on success.
381bool Prog::IsOnePass() {
382 if (did_onepass_)
383 return onepass_nodes_.data() != NULL;
384 did_onepass_ = true;
385
386 if (start() == 0) // no match
387 return false;
388
389 // Steal memory for the one-pass NFA from the overall DFA budget.
390 // Willing to use at most 1/4 of the DFA budget (heuristic).
391 // Limit max node count to 65000 as a conservative estimate to
392 // avoid overflowing 16-bit node index in encoding.
393 int maxnodes = 2 + inst_count(kInstByteRange);
394 int statesize = sizeof(OneState) + bytemap_range()*sizeof(uint32_t);
395 if (maxnodes >= 65000 || dfa_mem_ / 4 / statesize < maxnodes)
396 return false;
397
398 // Flood the graph starting at the start state, and check
399 // that in each reachable state, each possible byte leads
400 // to a unique next state.
401 int stacksize = inst_count(kInstCapture) +
402 inst_count(kInstEmptyWidth) +
403 inst_count(kInstNop) + 1; // + 1 for start inst
404 absl::FixedArray<InstCond, 64> stack_storage(stacksize);
405 InstCond* stack = stack_storage.data();
406
407 int size = this->size();
408 absl::FixedArray<int, 128> nodebyid_storage(size, -1); // indexed by ip
409 int* nodebyid = nodebyid_storage.data();
410
411 // Originally, nodes was a uint8_t[maxnodes*statesize], but that was
412 // unnecessarily optimistic: why allocate a large amount of memory
413 // upfront for a large program when it is unlikely to be one-pass?
414 absl::InlinedVector<uint8_t, 2048> nodes;
415
416 Instq tovisit(size), workq(size);
417 AddQ(&tovisit, start());
418 nodebyid[start()] = 0;
419 int nalloc = 1;
420 nodes.insert(nodes.end(), statesize, 0);
421 for (Instq::iterator it = tovisit.begin(); it != tovisit.end(); ++it) {
422 int id = *it;
423 int nodeindex = nodebyid[id];
424 OneState* node = IndexToNode(nodes.data(), statesize, nodeindex);
425
426 // Flood graph using manual stack, filling in actions as found.
427 // Default is none.
428 for (int b = 0; b < bytemap_range_; b++)
429 node->action[b] = kImpossible;
430 node->matchcond = kImpossible;
431
432 workq.clear();
433 bool matched = false;
434 int nstack = 0;
435 stack[nstack].id = id;
436 stack[nstack++].cond = 0;
437 while (nstack > 0) {
438 int id = stack[--nstack].id;
439 uint32_t cond = stack[nstack].cond;
440
441 Loop:
442 Prog::Inst* ip = inst(id);
443 switch (ip->opcode()) {
444 default:
445 LOG(DFATAL) << "unhandled opcode: " << ip->opcode();
446 break;
447
448 case kInstAltMatch:
449 // TODO(rsc): Ignoring kInstAltMatch optimization.
450 // Should implement it in this engine, but it's subtle.
451 DCHECK(!ip->last());
452 // If already on work queue, (1) is violated: bail out.
453 if (!AddQ(&workq, id+1))
454 goto fail;
455 id = id+1;
456 goto Loop;
457
458 case kInstByteRange: {
459 int nextindex = nodebyid[ip->out()];
460 if (nextindex == -1) {
461 if (nalloc >= maxnodes) {
462 if (ExtraDebug)
463 LOG(ERROR) << absl::StrFormat(
464 "Not OnePass: hit node limit %d >= %d", nalloc, maxnodes);
465 goto fail;
466 }
467 nextindex = nalloc;
468 AddQ(&tovisit, ip->out());
469 nodebyid[ip->out()] = nalloc;
470 nalloc++;
471 nodes.insert(nodes.end(), statesize, 0);
472 // Update node because it might have been invalidated.
473 node = IndexToNode(nodes.data(), statesize, nodeindex);
474 }
475 for (int c = ip->lo(); c <= ip->hi(); c++) {
476 int b = bytemap_[c];
477 // Skip any bytes immediately after c that are also in b.
478 while (c < 256-1 && bytemap_[c+1] == b)
479 c++;
480 uint32_t act = node->action[b];
481 uint32_t newact = (nextindex << kIndexShift) | cond;
482 if (matched)
483 newact |= kMatchWins;
484 if ((act & kImpossible) == kImpossible) {
485 node->action[b] = newact;
486 } else if (act != newact) {
487 if (ExtraDebug)
488 LOG(ERROR) << absl::StrFormat(
489 "Not OnePass: conflict on byte %#x at state %d", c, *it);
490 goto fail;
491 }
492 }
493 if (ip->foldcase()) {
494 Rune lo = std::max<Rune>(ip->lo(), 'a') + 'A' - 'a';
495 Rune hi = std::min<Rune>(ip->hi(), 'z') + 'A' - 'a';
496 for (int c = lo; c <= hi; c++) {
497 int b = bytemap_[c];
498 // Skip any bytes immediately after c that are also in b.
499 while (c < 256-1 && bytemap_[c+1] == b)
500 c++;
501 uint32_t act = node->action[b];
502 uint32_t newact = (nextindex << kIndexShift) | cond;
503 if (matched)
504 newact |= kMatchWins;
505 if ((act & kImpossible) == kImpossible) {
506 node->action[b] = newact;
507 } else if (act != newact) {
508 if (ExtraDebug)
509 LOG(ERROR) << absl::StrFormat(
510 "Not OnePass: conflict on byte %#x at state %d", c, *it);
511 goto fail;
512 }
513 }
514 }
515
516 if (ip->last())
517 break;
518 // If already on work queue, (1) is violated: bail out.
519 if (!AddQ(&workq, id+1))
520 goto fail;
521 id = id+1;
522 goto Loop;
523 }
524
525 case kInstCapture:
526 case kInstEmptyWidth:
527 case kInstNop:
528 if (!ip->last()) {
529 // If already on work queue, (1) is violated: bail out.
530 if (!AddQ(&workq, id+1))
531 goto fail;
532 stack[nstack].id = id+1;
533 stack[nstack++].cond = cond;
534 }
535
536 if (ip->opcode() == kInstCapture && ip->cap() < kMaxCap)
537 cond |= (1 << kCapShift) << ip->cap();
538 if (ip->opcode() == kInstEmptyWidth)
539 cond |= ip->empty();
540
541 // kInstCapture and kInstNop always proceed to ip->out().
542 // kInstEmptyWidth only sometimes proceeds to ip->out(),
543 // but as a conservative approximation we assume it always does.
544 // We could be a little more precise by looking at what c
545 // is, but that seems like overkill.
546
547 // If already on work queue, (1) is violated: bail out.
548 if (!AddQ(&workq, ip->out())) {
549 if (ExtraDebug)
550 LOG(ERROR) << absl::StrFormat(
551 "Not OnePass: multiple paths %d -> %d", *it, ip->out());
552 goto fail;
553 }
554 id = ip->out();
555 goto Loop;
556
557 case kInstMatch:
558 if (matched) {
559 // (3) is violated
560 if (ExtraDebug)
561 LOG(ERROR) << absl::StrFormat(
562 "Not OnePass: multiple matches from %d", *it);
563 goto fail;
564 }
565 matched = true;
566 node->matchcond = cond;
567
568 if (ip->last())
569 break;
570 // If already on work queue, (1) is violated: bail out.
571 if (!AddQ(&workq, id+1))
572 goto fail;
573 id = id+1;
574 goto Loop;
575
576 case kInstFail:
577 break;
578 }
579 }
580 }
581
582 if (ExtraDebug) { // For debugging, dump one-pass NFA to LOG(ERROR).
583 LOG(ERROR) << "bytemap:\n" << DumpByteMap();
584 LOG(ERROR) << "prog:\n" << Dump();
585
586 std::map<int, int> idmap;
587 for (int i = 0; i < size; i++)
588 if (nodebyid[i] != -1)
589 idmap[nodebyid[i]] = i;
590
591 std::string dump;
592 for (Instq::iterator it = tovisit.begin(); it != tovisit.end(); ++it) {
593 int id = *it;
594 int nodeindex = nodebyid[id];
595 if (nodeindex == -1)
596 continue;
597 OneState* node = IndexToNode(nodes.data(), statesize, nodeindex);
598 dump += absl::StrFormat("node %d id=%d: matchcond=%#x\n",
599 nodeindex, id, node->matchcond);
600 for (int i = 0; i < bytemap_range_; i++) {
601 if ((node->action[i] & kImpossible) == kImpossible)
602 continue;
603 dump += absl::StrFormat(" %d cond %#x -> %d id=%d\n",
604 i, node->action[i] & 0xFFFF,
605 node->action[i] >> kIndexShift,
606 idmap[node->action[i] >> kIndexShift]);
607 }
608 }
609 LOG(ERROR) << "nodes:\n" << dump;
610 }
611
612 dfa_mem_ -= nalloc*statesize;
613 onepass_nodes_ = PODArray<uint8_t>(nalloc*statesize);
614 memmove(onepass_nodes_.data(), nodes.data(), nalloc*statesize);
615 return true;
616
617fail:
618 return false;
619}
620
621} // namespace re2
622