1// Copyright 2008 The RE2 Authors. All Rights Reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5// Tested by search_test.cc, exhaustive_test.cc, tester.cc
6
7// Prog::SearchBitState is a regular expression search with submatch
8// tracking for small regular expressions and texts. Similarly to
9// testing/backtrack.cc, it allocates a bitmap with (count of
10// lists) * (length of text) bits to make sure it never explores the
11// same (instruction list, character position) multiple times. This
12// limits the search to run in time linear in the length of the text.
13//
14// Unlike testing/backtrack.cc, SearchBitState is not recursive
15// on the text.
16//
17// SearchBitState is a fast replacement for the NFA code on small
18// regexps and texts when SearchOnePass cannot be used.
19
20#include <stddef.h>
21#include <stdint.h>
22#include <string.h>
23#include <limits>
24#include <utility>
25
26#include "util/logging.h"
27#include "re2/pod_array.h"
28#include "re2/prog.h"
29#include "re2/regexp.h"
30
31namespace re2 {
32
33struct Job {
34 int id;
35 int rle; // run length encoding
36 const char* p;
37};
38
39class BitState {
40 public:
41 explicit BitState(Prog* prog);
42
43 // The usual Search prototype.
44 // Can only call Search once per BitState.
45 bool Search(absl::string_view text, absl::string_view context, bool anchored,
46 bool longest, absl::string_view* submatch, int nsubmatch);
47
48 private:
49 inline bool ShouldVisit(int id, const char* p);
50 void Push(int id, const char* p);
51 void GrowStack();
52 bool TrySearch(int id, const char* p);
53
54 // Search parameters
55 Prog* prog_; // program being run
56 absl::string_view text_; // text being searched
57 absl::string_view context_; // greater context of text being searched
58 bool anchored_; // whether search is anchored at text.begin()
59 bool longest_; // whether search wants leftmost-longest match
60 bool endmatch_; // whether match must end at text.end()
61 absl::string_view* submatch_; // submatches to fill in
62 int nsubmatch_; // # of submatches to fill in
63
64 // Search state
65 static constexpr int kVisitedBits = 64;
66 PODArray<uint64_t> visited_; // bitmap: (list ID, char*) pairs visited
67 PODArray<const char*> cap_; // capture registers
68 PODArray<Job> job_; // stack of text positions to explore
69 int njob_; // stack size
70
71 BitState(const BitState&) = delete;
72 BitState& operator=(const BitState&) = delete;
73};
74
75BitState::BitState(Prog* prog)
76 : prog_(prog),
77 anchored_(false),
78 longest_(false),
79 endmatch_(false),
80 submatch_(NULL),
81 nsubmatch_(0),
82 njob_(0) {
83}
84
85// Given id, which *must* be a list head, we can look up its list ID.
86// Then the question is: Should the search visit the (list ID, p) pair?
87// If so, remember that it was visited so that the next time,
88// we don't repeat the visit.
89bool BitState::ShouldVisit(int id, const char* p) {
90 int n = prog_->list_heads()[id] * static_cast<int>(text_.size()+1) +
91 static_cast<int>(p-text_.data());
92 if (visited_[n/kVisitedBits] & (uint64_t{1} << (n & (kVisitedBits-1))))
93 return false;
94 visited_[n/kVisitedBits] |= uint64_t{1} << (n & (kVisitedBits-1));
95 return true;
96}
97
98// Grow the stack.
99void BitState::GrowStack() {
100 PODArray<Job> tmp(2*job_.size());
101 memmove(tmp.data(), job_.data(), njob_*sizeof job_[0]);
102 job_ = std::move(tmp);
103}
104
105// Push (id, p) onto the stack, growing it if necessary.
106void BitState::Push(int id, const char* p) {
107 if (njob_ >= job_.size()) {
108 GrowStack();
109 if (njob_ >= job_.size()) {
110 LOG(DFATAL) << "GrowStack() failed: "
111 << "njob_ = " << njob_ << ", "
112 << "job_.size() = " << job_.size();
113 return;
114 }
115 }
116
117 // If id < 0, it's undoing a Capture,
118 // so we mustn't interfere with that.
119 if (id >= 0 && njob_ > 0) {
120 Job* top = &job_[njob_-1];
121 if (id == top->id &&
122 p == top->p + top->rle + 1 &&
123 top->rle < std::numeric_limits<int>::max()) {
124 ++top->rle;
125 return;
126 }
127 }
128
129 Job* top = &job_[njob_++];
130 top->id = id;
131 top->rle = 0;
132 top->p = p;
133}
134
135// Try a search from instruction id0 in state p0.
136// Return whether it succeeded.
137bool BitState::TrySearch(int id0, const char* p0) {
138 bool matched = false;
139 const char* end = text_.data() + text_.size();
140 njob_ = 0;
141 // Push() no longer checks ShouldVisit(),
142 // so we must perform the check ourselves.
143 if (ShouldVisit(id0, p0))
144 Push(id0, p0);
145 while (njob_ > 0) {
146 // Pop job off stack.
147 --njob_;
148 int id = job_[njob_].id;
149 int& rle = job_[njob_].rle;
150 const char* p = job_[njob_].p;
151
152 if (id < 0) {
153 // Undo the Capture.
154 cap_[prog_->inst(-id)->cap()] = p;
155 continue;
156 }
157
158 if (rle > 0) {
159 p += rle;
160 // Revivify job on stack.
161 --rle;
162 ++njob_;
163 }
164
165 Loop:
166 // Visit id, p.
167 Prog::Inst* ip = prog_->inst(id);
168 switch (ip->opcode()) {
169 default:
170 LOG(DFATAL) << "Unexpected opcode: " << ip->opcode();
171 return false;
172
173 case kInstFail:
174 break;
175
176 case kInstAltMatch:
177 if (ip->greedy(prog_)) {
178 // out1 is the Match instruction.
179 id = ip->out1();
180 p = end;
181 goto Loop;
182 }
183 if (longest_) {
184 // ip must be non-greedy...
185 // out is the Match instruction.
186 id = ip->out();
187 p = end;
188 goto Loop;
189 }
190 goto Next;
191
192 case kInstByteRange: {
193 int c = -1;
194 if (p < end)
195 c = *p & 0xFF;
196 if (!ip->Matches(c))
197 goto Next;
198
199 if (ip->hint() != 0)
200 Push(id+ip->hint(), p); // try the next when we're done
201 id = ip->out();
202 p++;
203 goto CheckAndLoop;
204 }
205
206 case kInstCapture:
207 if (!ip->last())
208 Push(id+1, p); // try the next when we're done
209
210 if (0 <= ip->cap() && ip->cap() < cap_.size()) {
211 // Capture p to register, but save old value first.
212 Push(-id, cap_[ip->cap()]); // undo when we're done
213 cap_[ip->cap()] = p;
214 }
215
216 id = ip->out();
217 goto CheckAndLoop;
218
219 case kInstEmptyWidth:
220 if (ip->empty() & ~Prog::EmptyFlags(context_, p))
221 goto Next;
222
223 if (!ip->last())
224 Push(id+1, p); // try the next when we're done
225 id = ip->out();
226 goto CheckAndLoop;
227
228 case kInstNop:
229 if (!ip->last())
230 Push(id+1, p); // try the next when we're done
231 id = ip->out();
232
233 CheckAndLoop:
234 // Sanity check: id is the head of its list, which must
235 // be the case if id-1 is the last of *its* list. :)
236 DCHECK(id == 0 || prog_->inst(id-1)->last());
237 if (ShouldVisit(id, p))
238 goto Loop;
239 break;
240
241 case kInstMatch: {
242 if (endmatch_ && p != end)
243 goto Next;
244
245 // We found a match. If the caller doesn't care
246 // where the match is, no point going further.
247 if (nsubmatch_ == 0)
248 return true;
249
250 // Record best match so far.
251 // Only need to check end point, because this entire
252 // call is only considering one start position.
253 matched = true;
254 cap_[1] = p;
255 if (submatch_[0].data() == NULL ||
256 (longest_ && p > submatch_[0].data() + submatch_[0].size())) {
257 for (int i = 0; i < nsubmatch_; i++)
258 submatch_[i] = absl::string_view(
259 cap_[2 * i],
260 static_cast<size_t>(cap_[2 * i + 1] - cap_[2 * i]));
261 }
262
263 // If going for first match, we're done.
264 if (!longest_)
265 return true;
266
267 // If we used the entire text, no longer match is possible.
268 if (p == end)
269 return true;
270
271 // Otherwise, continue on in hope of a longer match.
272 // Note the absence of the ShouldVisit() check here
273 // due to execution remaining in the same list.
274 Next:
275 if (!ip->last()) {
276 id++;
277 goto Loop;
278 }
279 break;
280 }
281 }
282 }
283 return matched;
284}
285
286// Search text (within context) for prog_.
287bool BitState::Search(absl::string_view text, absl::string_view context,
288 bool anchored, bool longest, absl::string_view* submatch,
289 int nsubmatch) {
290 // Search parameters.
291 text_ = text;
292 context_ = context;
293 if (context_.data() == NULL)
294 context_ = text;
295 if (prog_->anchor_start() && BeginPtr(context_) != BeginPtr(text))
296 return false;
297 if (prog_->anchor_end() && EndPtr(context_) != EndPtr(text))
298 return false;
299 anchored_ = anchored || prog_->anchor_start();
300 longest_ = longest || prog_->anchor_end();
301 endmatch_ = prog_->anchor_end();
302 submatch_ = submatch;
303 nsubmatch_ = nsubmatch;
304 for (int i = 0; i < nsubmatch_; i++)
305 submatch_[i] = absl::string_view();
306
307 // Allocate scratch space.
308 int nvisited = prog_->list_count() * static_cast<int>(text.size()+1);
309 nvisited = (nvisited + kVisitedBits-1) / kVisitedBits;
310 visited_ = PODArray<uint64_t>(nvisited);
311 memset(visited_.data(), 0, nvisited*sizeof visited_[0]);
312
313 int ncap = 2*nsubmatch;
314 if (ncap < 2)
315 ncap = 2;
316 cap_ = PODArray<const char*>(ncap);
317 memset(cap_.data(), 0, ncap*sizeof cap_[0]);
318
319 // When sizeof(Job) == 16, we start with a nice round 1KiB. :)
320 job_ = PODArray<Job>(64);
321
322 // Anchored search must start at text.begin().
323 if (anchored_) {
324 cap_[0] = text.data();
325 return TrySearch(prog_->start(), text.data());
326 }
327
328 // Unanchored search, starting from each possible text position.
329 // Notice that we have to try the empty string at the end of
330 // the text, so the loop condition is p <= text.end(), not p < text.end().
331 // This looks like it's quadratic in the size of the text,
332 // but we are not clearing visited_ between calls to TrySearch,
333 // so no work is duplicated and it ends up still being linear.
334 const char* etext = text.data() + text.size();
335 for (const char* p = text.data(); p <= etext; p++) {
336 // Try to use prefix accel (e.g. memchr) to skip ahead.
337 if (p < etext && prog_->can_prefix_accel()) {
338 p = reinterpret_cast<const char*>(prog_->PrefixAccel(p, etext - p));
339 if (p == NULL)
340 p = etext;
341 }
342
343 cap_[0] = p;
344 if (TrySearch(prog_->start(), p)) // Match must be leftmost; done.
345 return true;
346 // Avoid invoking undefined behavior (arithmetic on a null pointer)
347 // by simply not continuing the loop.
348 if (p == NULL)
349 break;
350 }
351 return false;
352}
353
354// Bit-state search.
355bool Prog::SearchBitState(absl::string_view text, absl::string_view context,
356 Anchor anchor, MatchKind kind,
357 absl::string_view* match, int nmatch) {
358 // If full match, we ask for an anchored longest match
359 // and then check that match[0] == text.
360 // So make sure match[0] exists.
361 absl::string_view sp0;
362 if (kind == kFullMatch) {
363 anchor = kAnchored;
364 if (nmatch < 1) {
365 match = &sp0;
366 nmatch = 1;
367 }
368 }
369
370 // Run the search.
371 BitState b(this);
372 bool anchored = anchor == kAnchored;
373 bool longest = kind != kFirstMatch;
374 if (!b.Search(text, context, anchored, longest, match, nmatch))
375 return false;
376 if (kind == kFullMatch && EndPtr(match[0]) != EndPtr(text))
377 return false;
378 return true;
379}
380
381} // namespace re2
382